diff --git a/packages/CLI11/.github/actions/cmake_config/Dockerfile b/packages/CLI11/.github/actions/cmake_config/Dockerfile
deleted file mode 100644
index 63b28a9949de219993f26a48d85db4afe6bce6df..0000000000000000000000000000000000000000
--- a/packages/CLI11/.github/actions/cmake_config/Dockerfile
+++ /dev/null
@@ -1,16 +0,0 @@
-FROM ubuntu:18.04
-
-RUN apt-get update \
- && apt-get install -y --no-install-recommends \
-        g++=4:7.4.0-1ubuntu2.3 \
-        wget=1.19.4-1ubuntu2.2 \
-        libidn11=1.33-2.1ubuntu1.2 \
-        ca-certificates=20180409 \
-        make=4.1-9.1ubuntu1 \
-        git=1:2.17.1-1ubuntu0.7 \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
-
-COPY entrypoint.sh /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
diff --git a/packages/CLI11/.github/actions/cmake_config/action.yml b/packages/CLI11/.github/actions/cmake_config/action.yml
deleted file mode 100644
index 73ff1661b5a6fe2840339996cf88671bc1053b67..0000000000000000000000000000000000000000
--- a/packages/CLI11/.github/actions/cmake_config/action.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-description: 'Test out a bare bones configuration with a CMake version'
-inputs:
-  version:
-    description: 'The full version of CMake to check'
-    required: true
-  options:
-    description: 'The CMake configuration options'
-    required: false
-    default: ""
-name: 'Configure with CMake'
-runs:
-  using: 'docker'
-  image: 'Dockerfile'
-  args:
-    - ${{ inputs.version }}
-    - ${{ inputs.options }}
diff --git a/packages/CLI11/.github/actions/cmake_config/entrypoint.sh b/packages/CLI11/.github/actions/cmake_config/entrypoint.sh
deleted file mode 100755
index e3bd622e1575375a746dc9425a7012b1777a79f1..0000000000000000000000000000000000000000
--- a/packages/CLI11/.github/actions/cmake_config/entrypoint.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash -l
-
-set -ex
-
-mkdir -p cmake_dir
-mkdir -p build_tmp
-mkdir -p cmake_sources
-rm -rf cmake_dir/* build_tmp/*
-
-v=$1
-fn=cmake-$v-Linux-x86_64.tar.gz
-
-if [ ! -f cmake_sources/$fn ]; then
-    wget -qO cmake_sources/$fn "https://cmake.org/files/v${v%.*}/$fn"
-fi
-
-tar -xzf cmake_sources/$fn --strip-components=1 -C $PWD/cmake_dir
-
-export PATH=$PWD/cmake_dir/bin:$PATH
-
-cmake --version
-
-cd build_tmp && cmake .. $2
diff --git a/packages/CLI11/.github/actions/quick_cmake/action.yml b/packages/CLI11/.github/actions/quick_cmake/action.yml
new file mode 100644
index 0000000000000000000000000000000000000000..da721a78c32e275b7f8c93e7a8457b1d009925fc
--- /dev/null
+++ b/packages/CLI11/.github/actions/quick_cmake/action.yml
@@ -0,0 +1,18 @@
+name: Quick CMake config
+description: 'Runs CMake 3.4+ (if already setup)'
+inputs:
+  args:
+    description: 'Other arguments'
+    required: false
+    default: ''
+
+runs:
+  using: composite
+  steps:
+    - run: |
+        mkdir -p build-tmp
+        touch build-tmp/tmp
+        rm -r build-tmp/*
+        (cd build-tmp && cmake .. ${{ inputs.args}})
+        rm -r build-tmp
+      shell: bash
diff --git a/packages/CLI11/.github/workflows/tests.yml b/packages/CLI11/.github/workflows/tests.yml
index 9fcfc705b2860f8ff85464eae2562ae90229f123..60f10f9623eaf0218344a63d60737eea92d5ace2 100644
--- a/packages/CLI11/.github/workflows/tests.yml
+++ b/packages/CLI11/.github/workflows/tests.yml
@@ -15,7 +15,7 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - uses: actions/setup-python@v2
-    - uses: pre-commit/action@v2.0.0
+    - uses: pre-commit/action@v2.0.2
 
   cuda-build:
     name: CUDA build only
@@ -28,7 +28,7 @@ jobs:
     - name: Add wget
       run: apt-get update && apt-get install -y wget
     - name: Setup cmake
-      uses: jwlawson/actions-setup-cmake@v1.7
+      uses: jwlawson/actions-setup-cmake@v1.8
     - name: Configure
       run: cmake -S . -B build -DCLI11_CUDA_TESTS=ON
     - name: Build
@@ -39,81 +39,144 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-      with:
-        submodules: true
+
     - name: CMake 3.4
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.4.3
+        cmake-version: "3.4"
+    - name: Check CMake 3.4
+      uses: ./.github/actions/quick_cmake
+
     - name: CMake 3.5
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.5.2
+        cmake-version: "3.5"
+    - name: Check CMake 3.5
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.6
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.6.3
+        cmake-version: "3.6"
+    - name: Check CMake 3.6
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.7
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.7.2
+        cmake-version: "3.7"
+    - name: Check CMake 3.7
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.8
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.8.2
+        cmake-version: "3.8"
+    - name: Check CMake 3.8
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.9
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.9.6
+        cmake-version: "3.9"
+    - name: Check CMake 3.9
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.10
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.10.3
+        cmake-version: "3.10"
+    - name: Check CMake 3.10
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
-    - name: CMake 3.11 (full)
-      uses: ./.github/actions/cmake_config
+
+    - name: CMake 3.11
+      uses: jwlawson/actions-setup-cmake@v1.8
+      with:
+        cmake-version: "3.11"
+    - name: Check CMake 3.11 (full)
+      uses: ./.github/actions/quick_cmake
       with:
-        version: 3.11.4
-        options: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
+        args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
       if: success() || failure()
+
     - name: CMake 3.12
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.12.4
+        cmake-version: "3.12"
+    - name: Check CMake 3.12
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.13
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.13.5
+        cmake-version: "3.13"
+    - name: Check CMake 3.13
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.14
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.14.7
+        cmake-version: "3.14"
+    - name: Check CMake 3.14
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.15
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.15.6
+        cmake-version: "3.15"
+    - name: Check CMake 3.15
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.16
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.16.8
+        cmake-version: "3.16"
+    - name: Check CMake 3.16
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
     - name: CMake 3.17
-      uses: ./.github/actions/cmake_config
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.17.3
+        cmake-version: "3.17"
+    - name: Check CMake 3.17
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
-    - name: CMake 3.18 (full)
-      uses: ./.github/actions/cmake_config
+
+    - name: CMake 3.18
+      uses: jwlawson/actions-setup-cmake@v1.8
       with:
-        version: 3.18.0
-        options: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
+        cmake-version: "3.18"
+    - name: Check CMake 3.18
+      uses: ./.github/actions/quick_cmake
       if: success() || failure()
+
+    - name: CMake 3.19
+      uses: jwlawson/actions-setup-cmake@v1.8
+      with:
+        cmake-version: "3.19"
+    - name: Check CMake 3.19 (full)
+      uses: ./.github/actions/quick_cmake
+      with:
+        args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
+      if: success() || failure()
+
+    - name: CMake 3.20
+      uses: jwlawson/actions-setup-cmake@v1.8
+      with:
+        cmake-version: "3.20"
+    - name: Check CMake 3.20
+      uses: ./.github/actions/quick_cmake
+      if: success() || failure()
+
+
diff --git a/packages/CLI11/.gitignore b/packages/CLI11/.gitignore
index 2a6ef59a35af60bd3b7ae6ec675f2b045fd8dfbb..cc1b9d0c7f77776f258bfccbe82224fe588e9582 100644
--- a/packages/CLI11/.gitignore
+++ b/packages/CLI11/.gitignore
@@ -12,3 +12,4 @@ a.out*
 /node_modules/*
 /package.json
 /yarn.lock
+/CLI11.hpp
diff --git a/packages/CLI11/.gitmodules b/packages/CLI11/.gitmodules
deleted file mode 100644
index 6051b7f20049a412a56aa8f664a56a8e35b0278f..0000000000000000000000000000000000000000
--- a/packages/CLI11/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "extern/googletest"]
-	path = extern/googletest
-	url = ../../google/googletest.git
diff --git a/packages/CLI11/.gitrepo b/packages/CLI11/.gitrepo
index e423eb3eead27d0875cc8ea21a30bf8ceec6ed8c..732e03b962d7b98a783a3ae9f93d8ac9232673e5 100644
--- a/packages/CLI11/.gitrepo
+++ b/packages/CLI11/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:CLIUtils/CLI11.git
 	branch = master
-	commit = 639a8add1e248c7337b420ff68572ddb3893e080
-	parent = b7068f18e2c214064a81a5b561d5f04a80d2a847
+	commit = 4af78beef777e313814b4daff70e2da9171a385a
+	parent = 015d2fd5332b347d28c47c8dfe3f401382724178
 	cmdver = 0.4.3
 	method = merge
diff --git a/packages/CLI11/CHANGELOG.md b/packages/CLI11/CHANGELOG.md
index f9cc853b7f3bf554c2743b98de8f6fe53e610617..d2a59b4786a836a07801228e5371e095b42c5e23 100644
--- a/packages/CLI11/CHANGELOG.md
+++ b/packages/CLI11/CHANGELOG.md
@@ -1,14 +1,42 @@
 ## Version 2.0: In progress
 
-* Built-in config format is TOML compliant now [#435]
-* Config updates [#442]
-* More powerful containers, `%%` separator [#423]
-* Add a version flag easily [#452]
+* Built-in config format is TOML compliant now [#435][]
+    * Support multiline TOML [#528][]
+* Support short/positional options in config mode [#443][]
+* More powerful containers, `%%` separator [#423][]
+* Add a version flag easily [#452][]
+* Support atomic types [#520][]
+* Add a type validator `CLI::TypeValidator<TYPE>` [#526][]
+* Support `->silent()` on subcommands. [#529][]
+* Add alias section to help for subcommands [#545][]
+* Redesigned MakeSingleFiles to have a higher level of manual control, to support future features. [#546][]
+* Moved testing from GTest to Catch2 [#574][]
+
+* Bugfix: avoid listing helpall as a required flag [#530][]
+* Bugfix: avoid a clash with WINDOWS define [#563][]
+
+* Removed deprecated set commands, use validators instead. [#565][]
+
+* Build: support pkg-config [#523][]
+
 
 [#435]: https://github.com/CLIUtils/CLI11/pull/435
 [#443]: https://github.com/CLIUtils/CLI11/pull/443
 [#423]: https://github.com/CLIUtils/CLI11/pull/423
 [#452]: https://github.com/CLIUtils/CLI11/pull/452
+[#520]: https://github.com/CLIUtils/CLI11/pull/520
+[#523]: https://github.com/CLIUtils/CLI11/pull/523
+[#526]: https://github.com/CLIUtils/CLI11/pull/526
+[#528]: https://github.com/CLIUtils/CLI11/pull/528
+[#529]: https://github.com/CLIUtils/CLI11/pull/529
+[#530]: https://github.com/CLIUtils/CLI11/pull/530
+[#545]: https://github.com/CLIUtils/CLI11/pull/545
+[#546]: https://github.com/CLIUtils/CLI11/pull/546
+[#563]: https://github.com/CLIUtils/CLI11/pull/563
+[#565]: https://github.com/CLIUtils/CLI11/pull/565
+[#574]: https://github.com/CLIUtils/CLI11/pull/574
+
+
 
 
 ### Version 1.9.1: Backporting fixes
diff --git a/packages/CLI11/CPPLINT.cfg b/packages/CLI11/CPPLINT.cfg
index d497667bbc899dc8056d25387413cd382d7737f9..0a1758da0e5b703d4677bcaf60d171455c4aee66 100644
--- a/packages/CLI11/CPPLINT.cfg
+++ b/packages/CLI11/CPPLINT.cfg
@@ -5,6 +5,8 @@ linelength=120  # As in .clang-format
 filter=-build/c++11  # Reports e.g. chrono and thread, which overlap with Chromium's API. Not applicable to general C++ projects.
 filter=-build/include_order  # Requires unusual include order that encourages creating not self-contained headers
 filter=-readability/nolint  # Conflicts with clang-tidy
+filter=-readability/check  # Catch uses CHECK(a == b) (Tests only)
+filter=-build/namespaces  # Currently using it for one test (Tests only)
 filter=-runtime/references  # Requires fundamental change of API, don't see need for this
 filter=-whitespace/blank_line  # Unnecessarily strict with blank lines that otherwise help with readability
 filter=-whitespace/indent  # Requires strange 3-space indent of private/protected/public markers
diff --git a/packages/CLI11/README.md b/packages/CLI11/README.md
index 8ffb6b25ddd02583763cb18dcd4902ca221a8b63..846d44ec7310e567fa32897088c03d659d1e3b56 100644
--- a/packages/CLI11/README.md
+++ b/packages/CLI11/README.md
@@ -224,7 +224,7 @@ While all options internally are the same type, there are several ways to add an
 app.add_option(option_name, help_str="")
 
 app.add_option(option_name,
-               variable_to_bind_to, // bool, char(see note)๐Ÿšง, int, float, vector, enum, or string-like, or anything with a defined conversion from a string or that takes an int ๐Ÿ†•, double ๐Ÿ†•, or string in a constructor. Also allowed are tuples ๐Ÿ†•, std::array ๐Ÿ†• or std::pair ๐Ÿ†•. Also supported are complex numbers๐Ÿšง, wrapper types๐Ÿšง, and containers besides vector๐Ÿšง of any other supported type.
+               variable_to_bind_to, // bool, char(see note)๐Ÿšง, int, float, vector, enum, std::atomic ๐Ÿšง, or string-like, or anything with a defined conversion from a string or that takes an int ๐Ÿ†•, double ๐Ÿ†•, or string in a constructor. Also allowed are tuples ๐Ÿ†•, std::array ๐Ÿ†• or std::pair ๐Ÿ†•. Also supported are complex numbers๐Ÿšง, wrapper types๐Ÿšง, and containers besides vector๐Ÿšง of any other supported type.
                help_string="")
 
 app.add_option_function<type>(option_name,
@@ -245,7 +245,7 @@ app.add_flag(option_name,
              help_string="")
 
 app.add_flag(option_name,
-             variable_to_bind_to, // bool, int, float, complex, containers, enum, or string-like, or any singular object with a defined conversion from a string like add_option
+             variable_to_bind_to, // bool, int, float, complex, containers, enum, std::atomic ๐Ÿšง, or string-like, or any singular object with a defined conversion from a string like add_option
              help_string="")
 
 app.add_flag_function(option_name,
diff --git a/packages/CLI11/azure-pipelines.yml b/packages/CLI11/azure-pipelines.yml
index 90017c62f8f111acfa4eb95c78f07d2c1453fd9c..c72c748e6f59d6af651e28231b6cbb0d53f8a32d 100644
--- a/packages/CLI11/azure-pipelines.yml
+++ b/packages/CLI11/azure-pipelines.yml
@@ -121,7 +121,7 @@ jobs:
         cli11.std: 17
         cli11.options: -DCLI11_FORCE_LIBCXX=ON
       clang10_20:
-        containerImage: helics/buildenv:clang10-builder
+        containerImage: silkeh/clang:10
         cli11.std: 20
         cli11.options: -DCLI11_FORCE_LIBCXX=ON -DCMAKE_CXX_FLAGS=-std=c++20
   container: $[ variables['containerImage'] ]
diff --git a/packages/CLI11/cmake/AddGoogletest.cmake b/packages/CLI11/cmake/AddGoogletest.cmake
deleted file mode 100644
index ae0dc18fcb96f974148645e2ab2fb0abc8efe564..0000000000000000000000000000000000000000
--- a/packages/CLI11/cmake/AddGoogletest.cmake
+++ /dev/null
@@ -1,49 +0,0 @@
-#
-#
-# Includes GTest and provides a helper macro to add tests. Add make check, as well, which
-# gives output on failed tests without having to set an environment variable.
-#
-#
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-set(BUILD_SHARED_LIBS OFF)
-
-add_subdirectory("${CLI11_SOURCE_DIR}/extern/googletest" "${CLI11_BINARY_DIR}/extern/googletest" EXCLUDE_FROM_ALL)
-
-
-if(GOOGLE_TEST_INDIVIDUAL)
-    if(NOT CMAKE_VERSION VERSION_LESS 3.9)
-        include(GoogleTest)
-    else()
-        set(GOOGLE_TEST_INDIVIDUAL OFF)
-    endif()
-endif()
-
-# Target must already exist
-macro(add_gtest TESTNAME)
-    target_link_libraries(${TESTNAME} PUBLIC gtest gmock gtest_main)
-
-    if(GOOGLE_TEST_INDIVIDUAL)
-        if(CMAKE_VERSION VERSION_LESS 3.10)
-            gtest_add_tests(TARGET ${TESTNAME}
-                            TEST_PREFIX "${TESTNAME}."
-                            TEST_LIST TmpTestList)
-            set_tests_properties(${TmpTestList} PROPERTIES FOLDER "Tests")
-        else()
-            gtest_discover_tests(${TESTNAME}
-                TEST_PREFIX "${TESTNAME}."
-                PROPERTIES FOLDER "Tests")
-
-        endif()
-    else()
-        add_test(${TESTNAME} ${TESTNAME})
-        set_target_properties(${TESTNAME} PROPERTIES FOLDER "Tests")
-        if (CLI11_FORCE_LIBCXX)
-           set_property(TARGET ${T} APPEND_STRING
-             PROPERTY LINK_FLAGS -stdlib=libc++)
-         endif()
-    endif()
-
-endmacro()
-
-set_target_properties(gtest gtest_main gmock gmock_main
-    PROPERTIES FOLDER "Extern")
diff --git a/packages/CLI11/extern/googletest b/packages/CLI11/extern/googletest
deleted file mode 160000
index 859bfe8981d6724c4ea06e73d29accd8588f3230..0000000000000000000000000000000000000000
--- a/packages/CLI11/extern/googletest
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 859bfe8981d6724c4ea06e73d29accd8588f3230
diff --git a/packages/CLI11/include/CLI/App.hpp b/packages/CLI11/include/CLI/App.hpp
index d3c513bd03de33141eafa41ea176fe742992e69d..edbe2b7386ba7be718762c034fa2c3d3bf87ce9b 100644
--- a/packages/CLI11/include/CLI/App.hpp
+++ b/packages/CLI11/include/CLI/App.hpp
@@ -44,7 +44,7 @@ namespace CLI {
 #endif
 
 namespace detail {
-enum class Classifier { NONE, POSITIONAL_MARK, SHORT, LONG, WINDOWS, SUBCOMMAND, SUBCOMMAND_TERMINATOR };
+enum class Classifier { NONE, POSITIONAL_MARK, SHORT, LONG, WINDOWS_STYLE, SUBCOMMAND, SUBCOMMAND_TERMINATOR };
 struct AppFriend;
 }  // namespace detail
 
@@ -897,56 +897,6 @@ class App {
     }
 #endif
 
-    /// Add set of options (No default, temp reference, such as an inline set) DEPRECATED
-    template <typename T>
-    Option *add_set(std::string option_name,
-                    T &member,            ///< The selected member of the set
-                    std::set<T> options,  ///< The set of possibilities
-                    std::string option_description = "") {
-
-        Option *opt = add_option(option_name, member, std::move(option_description));
-        opt->check(IsMember{options});
-        return opt;
-    }
-
-    /// Add set of options (No default, set can be changed afterwards - do not destroy the set) DEPRECATED
-    template <typename T>
-    Option *add_mutable_set(std::string option_name,
-                            T &member,                   ///< The selected member of the set
-                            const std::set<T> &options,  ///< The set of possibilities
-                            std::string option_description = "") {
-
-        Option *opt = add_option(option_name, member, std::move(option_description));
-        opt->check(IsMember{&options});
-        return opt;
-    }
-
-    /// Add set of options (with default, static set, such as an inline set) DEPRECATED
-    template <typename T>
-    Option *add_set(std::string option_name,
-                    T &member,            ///< The selected member of the set
-                    std::set<T> options,  ///< The set of possibilities
-                    std::string option_description,
-                    bool defaulted) {
-
-        Option *opt = add_option(option_name, member, std::move(option_description), defaulted);
-        opt->check(IsMember{options});
-        return opt;
-    }
-
-    /// Add set of options (with default, set can be changed afterwards - do not destroy the set) DEPRECATED
-    template <typename T>
-    Option *add_mutable_set(std::string option_name,
-                            T &member,                   ///< The selected member of the set
-                            const std::set<T> &options,  ///< The set of possibilities
-                            std::string option_description,
-                            bool defaulted) {
-
-        Option *opt = add_option(option_name, member, std::move(option_description), defaulted);
-        opt->check(IsMember{&options});
-        return opt;
-    }
-
     /// Add a complex number DEPRECATED --use add_option instead
     template <typename T, typename XC = double>
     Option *add_complex(std::string option_name,
@@ -2072,7 +2022,7 @@ class App {
             return detail::Classifier::SHORT;
         }
         if((allow_windows_style_options_) && (detail::split_windows_style(current, dummy1, dummy2)))
-            return detail::Classifier::WINDOWS;
+            return detail::Classifier::WINDOWS_STYLE;
         if((current == "++") && !name_.empty() && parent_ != nullptr)
             return detail::Classifier::SUBCOMMAND_TERMINATOR;
         return detail::Classifier::NONE;
@@ -2525,7 +2475,7 @@ class App {
             break;
         case detail::Classifier::LONG:
         case detail::Classifier::SHORT:
-        case detail::Classifier::WINDOWS:
+        case detail::Classifier::WINDOWS_STYLE:
             // If already parsed a subcommand, don't accept options_
             _parse_arg(args, classifier);
             break;
@@ -2742,7 +2692,7 @@ class App {
             if(!detail::split_short(current, arg_name, rest))
                 throw HorribleError("Short parsed but missing! You should not see this");
             break;
-        case detail::Classifier::WINDOWS:
+        case detail::Classifier::WINDOWS_STYLE:
             if(!detail::split_windows_style(current, arg_name, value))
                 throw HorribleError("windows option parsed but missing! You should not see this");
             break;
@@ -2760,7 +2710,7 @@ class App {
                     return opt->check_lname(arg_name);
                 if(current_type == detail::Classifier::SHORT)
                     return opt->check_sname(arg_name);
-                // this will only get called for detail::Classifier::WINDOWS
+                // this will only get called for detail::Classifier::WINDOWS_STYLE
                 return opt->check_lname(arg_name) || opt->check_sname(arg_name);
             });
 
diff --git a/packages/CLI11/tests/AppTest.cpp b/packages/CLI11/tests/AppTest.cpp
index 89f52c0853354280eee2745920a92b290a55fe4f..6c3e71ced8ee59cbe4409698b86b44b56430a881 100644
--- a/packages/CLI11/tests/AppTest.cpp
+++ b/packages/CLI11/tests/AppTest.cpp
@@ -9,129 +9,127 @@
 #include <cstdint>
 #include <cstdlib>
 
-#include "gmock/gmock.h"
-
-TEST_F(TApp, OneFlagShort) {
+TEST_CASE_METHOD(TApp, "OneFlagShort", "[app]") {
     app.add_flag("-c,--count");
     args = {"-c"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
 }
 
-TEST_F(TApp, OneFlagShortValues) {
+TEST_CASE_METHOD(TApp, "OneFlagShortValues", "[app]") {
     app.add_flag("-c{v1},--count{v2}");
     args = {"-c"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
     auto v = app["-c"]->results();
-    EXPECT_EQ(v[0], "v1");
+    CHECK("v1" == v[0]);
 
-    EXPECT_THROW(app["--invalid"], CLI::OptionNotFound);
+    CHECK_THROWS_AS(app["--invalid"], CLI::OptionNotFound);
 }
 
-TEST_F(TApp, OneFlagShortValuesAs) {
+TEST_CASE_METHOD(TApp, "OneFlagShortValuesAs", "[app]") {
     auto flg = app.add_flag("-c{1},--count{2}");
     args = {"-c"};
     run();
     auto opt = app["-c"];
-    EXPECT_EQ(opt->as<int>(), 1);
+    CHECK(1 == opt->as<int>());
     args = {"--count"};
     run();
-    EXPECT_EQ(opt->as<int>(), 2);
+    CHECK(2 == opt->as<int>());
     flg->take_first();
     args = {"-c", "--count"};
     run();
-    EXPECT_EQ(opt->as<int>(), 1);
+    CHECK(1 == opt->as<int>());
     flg->take_last();
-    EXPECT_EQ(opt->as<int>(), 2);
+    CHECK(2 == opt->as<int>());
     flg->multi_option_policy(CLI::MultiOptionPolicy::Throw);
-    EXPECT_THROW(opt->as<int>(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(opt->as<int>(), CLI::ArgumentMismatch);
     flg->multi_option_policy(CLI::MultiOptionPolicy::TakeAll);
     auto vec = opt->as<std::vector<int>>();
-    EXPECT_EQ(vec[0], 1);
-    EXPECT_EQ(vec[1], 2);
+    CHECK(1 == vec[0]);
+    CHECK(2 == vec[1]);
     flg->multi_option_policy(CLI::MultiOptionPolicy::Join);
-    EXPECT_EQ(opt->as<std::string>(), "1\n2");
+    CHECK("1\n2" == opt->as<std::string>());
     flg->delimiter(',');
-    EXPECT_EQ(opt->as<std::string>(), "1,2");
+    CHECK("1,2" == opt->as<std::string>());
 }
 
-TEST_F(TApp, OneFlagShortWindows) {
+TEST_CASE_METHOD(TApp, "OneFlagShortWindows", "[app]") {
     app.add_flag("-c,--count");
     args = {"/c"};
     app.allow_windows_style_options();
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
 }
 
-TEST_F(TApp, WindowsLongShortMix1) {
+TEST_CASE_METHOD(TApp, "WindowsLongShortMix1", "[app]") {
     app.allow_windows_style_options();
 
     auto a = app.add_flag("-c");
     auto b = app.add_flag("--c");
     args = {"/c"};
     run();
-    EXPECT_EQ(1u, a->count());
-    EXPECT_EQ(0u, b->count());
+    CHECK(a->count() == 1u);
+    CHECK(b->count() == 0u);
 }
 
-TEST_F(TApp, WindowsLongShortMix2) {
+TEST_CASE_METHOD(TApp, "WindowsLongShortMix2", "[app]") {
     app.allow_windows_style_options();
 
     auto a = app.add_flag("--c");
     auto b = app.add_flag("-c");
     args = {"/c"};
     run();
-    EXPECT_EQ(1u, a->count());
-    EXPECT_EQ(0u, b->count());
+    CHECK(a->count() == 1u);
+    CHECK(b->count() == 0u);
 }
 
-TEST_F(TApp, CountNonExist) {
+TEST_CASE_METHOD(TApp, "CountNonExist", "[app]") {
     app.add_flag("-c,--count");
     args = {"-c"};
     run();
-    EXPECT_THROW(app.count("--nonexist"), CLI::OptionNotFound);
+    CHECK_THROWS_AS(app.count("--nonexist"), CLI::OptionNotFound);
 }
 
-TEST_F(TApp, OneFlagLong) {
+TEST_CASE_METHOD(TApp, "OneFlagLong", "[app]") {
     app.add_flag("-c,--count");
     args = {"--count"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
 }
 
-TEST_F(TApp, DashedOptions) {
+TEST_CASE_METHOD(TApp, "DashedOptions", "[app]") {
     app.add_flag("-c");
     app.add_flag("--q");
     app.add_flag("--this,--that");
 
     args = {"-c", "--q", "--this", "--that"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--q"));
-    EXPECT_EQ(2u, app.count("--this"));
-    EXPECT_EQ(2u, app.count("--that"));
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--q") == 1u);
+    CHECK(app.count("--this") == 2u);
+    CHECK(app.count("--that") == 2u);
 }
 
-TEST_F(TApp, DashedOptionsSingleString) {
+TEST_CASE_METHOD(TApp, "DashedOptionsSingleString", "[app]") {
     app.add_flag("-c");
     app.add_flag("--q");
     app.add_flag("--this,--that");
 
     app.parse("-c --q --this --that");
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--q"));
-    EXPECT_EQ(2u, app.count("--this"));
-    EXPECT_EQ(2u, app.count("--that"));
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--q") == 1u);
+    CHECK(app.count("--this") == 2u);
+    CHECK(app.count("--that") == 2u);
 }
 
-TEST_F(TApp, RequireOptionsError) {
-    using ::testing::HasSubstr;
-    using ::testing::Not;
+TEST_CASE_METHOD(TApp, "RequireOptionsError", "[app]") {
+    using Catch::Matchers::Contains;
+
     app.add_flag("-c");
     app.add_flag("--q");
     app.add_flag("--this,--that");
@@ -141,267 +139,267 @@ TEST_F(TApp, RequireOptionsError) {
     try {
         app.parse("-c --q --this --that");
     } catch(const CLI::RequiredError &re) {
-        EXPECT_THAT(re.what(), Not(HasSubstr("-h,--help")));
-        EXPECT_THAT(re.what(), Not(HasSubstr("help_all")));
+        CHECK_THAT(re.what(), !Contains("-h,--help"));
+        CHECK_THAT(re.what(), !Contains("help_all"));
     }
 
-    EXPECT_NO_THROW(app.parse("-c --q"));
-    EXPECT_NO_THROW(app.parse("-c --this --that"));
+    CHECK_NOTHROW(app.parse("-c --q"));
+    CHECK_NOTHROW(app.parse("-c --this --that"));
 }
 
-TEST_F(TApp, BoolFlagOverride) {
+TEST_CASE_METHOD(TApp, "BoolFlagOverride", "[app]") {
     bool val{false};
     auto flg = app.add_flag("--this,--that", val);
 
     app.parse("--this");
-    EXPECT_TRUE(val);
+    CHECK(val);
     app.parse("--this=false");
-    EXPECT_FALSE(val);
+    CHECK(!val);
     flg->disable_flag_override(true);
     app.parse("--this");
-    EXPECT_TRUE(val);
+    CHECK(val);
     // this is allowed since the matching string is the default
     app.parse("--this=true");
-    EXPECT_TRUE(val);
+    CHECK(val);
 
-    EXPECT_THROW(app.parse("--this=false"), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(app.parse("--this=false"), CLI::ArgumentMismatch);
     // try a string that specifies 'use default val'
-    EXPECT_NO_THROW(app.parse("--this={}"));
+    CHECK_NOTHROW(app.parse("--this={}"));
 }
 
-TEST_F(TApp, OneFlagRef) {
+TEST_CASE_METHOD(TApp, "OneFlagRef", "[app]") {
     int ref{0};
     app.add_flag("-c,--count", ref);
     args = {"--count"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
-    EXPECT_EQ(1, ref);
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
+    CHECK(ref == 1);
 }
 
-TEST_F(TApp, OneFlagRefValue) {
+TEST_CASE_METHOD(TApp, "OneFlagRefValue", "[app]") {
     int ref{0};
     app.add_flag("-c,--count", ref);
     args = {"--count=7"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
-    EXPECT_EQ(7, ref);
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
+    CHECK(ref == 7);
 }
 
-TEST_F(TApp, OneFlagRefValueFalse) {
+TEST_CASE_METHOD(TApp, "OneFlagRefValueFalse", "[app]") {
     int ref{0};
     auto flg = app.add_flag("-c,--count", ref);
     args = {"--count=false"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
-    EXPECT_EQ(-1, ref);
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
+    CHECK(ref == -1);
 
-    EXPECT_FALSE(flg->check_fname("c"));
+    CHECK(!flg->check_fname("c"));
     args = {"--count=0"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
-    EXPECT_EQ(-1, ref);
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
+    CHECK(ref == -1);
 
     args = {"--count=happy"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, FlagNegation) {
+TEST_CASE_METHOD(TApp, "FlagNegation", "[app]") {
     int ref{0};
     auto flg = app.add_flag("-c,--count,--ncount{false}", ref);
     args = {"--count", "-c", "--ncount"};
-    EXPECT_FALSE(flg->check_fname("count"));
-    EXPECT_TRUE(flg->check_fname("ncount"));
+    CHECK(!flg->check_fname("count"));
+    CHECK(flg->check_fname("ncount"));
     run();
-    EXPECT_EQ(3u, app.count("-c"));
-    EXPECT_EQ(3u, app.count("--count"));
-    EXPECT_EQ(3u, app.count("--ncount"));
-    EXPECT_EQ(1, ref);
+    CHECK(app.count("-c") == 3u);
+    CHECK(app.count("--count") == 3u);
+    CHECK(app.count("--ncount") == 3u);
+    CHECK(ref == 1);
 }
 
-TEST_F(TApp, FlagNegationShortcutNotation) {
+TEST_CASE_METHOD(TApp, "FlagNegationShortcutNotation", "[app]") {
     int ref{0};
     app.add_flag("-c,--count{true},!--ncount", ref);
     args = {"--count=TRUE", "-c", "--ncount"};
     run();
-    EXPECT_EQ(3u, app.count("-c"));
-    EXPECT_EQ(3u, app.count("--count"));
-    EXPECT_EQ(3u, app.count("--ncount"));
-    EXPECT_EQ(1, ref);
+    CHECK(app.count("-c") == 3u);
+    CHECK(app.count("--count") == 3u);
+    CHECK(app.count("--ncount") == 3u);
+    CHECK(ref == 1);
 }
 
-TEST_F(TApp, FlagNegationShortcutNotationInvalid) {
+TEST_CASE_METHOD(TApp, "FlagNegationShortcutNotationInvalid", "[app]") {
     int ref{0};
     app.add_flag("-c,--count,!--ncount", ref);
     args = {"--ncount=happy"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, OneString) {
+TEST_CASE_METHOD(TApp, "OneString", "[app]") {
     std::string str;
     app.add_option("-s,--string", str);
     args = {"--string", "mystring"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, OneStringWindowsStyle) {
+TEST_CASE_METHOD(TApp, "OneStringWindowsStyle", "[app]") {
     std::string str;
     app.add_option("-s,--string", str);
     args = {"/string", "mystring"};
     app.allow_windows_style_options();
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, OneStringSingleStringInput) {
+TEST_CASE_METHOD(TApp, "OneStringSingleStringInput", "[app]") {
     std::string str;
     app.add_option("-s,--string", str);
 
     app.parse("--string mystring");
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, OneStringEqualVersion) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersion", "[app]") {
     std::string str;
     app.add_option("-s,--string", str);
     args = {"--string=mystring"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, OneStringEqualVersionWindowsStyle) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionWindowsStyle", "[app]") {
     std::string str;
     app.add_option("-s,--string", str);
     args = {"/string:mystring"};
     app.allow_windows_style_options();
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleString) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleString", "[app]") {
     std::string str;
     app.add_option("-s,--string", str);
     app.parse("--string=mystring");
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringQuoted) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuoted", "[app]") {
     std::string str;
     app.add_option("-s,--string", str);
     app.parse(R"raw(--string="this is my quoted string")raw");
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "this is my quoted string");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("this is my quoted string" == str);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringQuotedMultiple) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedMultiple", "[app]") {
     std::string str, str2, str3;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("-m,--mstr", str3);
     app.parse(R"raw(--string="this is my quoted string" -t 'qstring 2' -m=`"quoted string"`)raw");
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("this is my quoted string" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringEmbeddedEqual) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringEmbeddedEqual", "[app]") {
     std::string str, str2, str3;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("-m,--mstr", str3);
     app.parse(R"raw(--string="app=\"test1 b\" test2=\"frogs\"" -t 'qstring 2' -m=`"quoted string"`)raw");
-    EXPECT_EQ(str, "app=\"test1 b\" test2=\"frogs\"");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("app=\"test1 b\" test2=\"frogs\"" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 
     app.parse(R"raw(--string="app='test1 b' test2='frogs'" -t 'qstring 2' -m=`"quoted string"`)raw");
-    EXPECT_EQ(str, "app='test1 b' test2='frogs'");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("app='test1 b' test2='frogs'" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringEmbeddedEqualWindowsStyle) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringEmbeddedEqualWindowsStyle", "[app]") {
     std::string str, str2, str3;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("--mstr", str3);
     app.allow_windows_style_options();
     app.parse(R"raw(/string:"app:\"test1 b\" test2:\"frogs\"" /t 'qstring 2' /mstr:`"quoted string"`)raw");
-    EXPECT_EQ(str, "app:\"test1 b\" test2:\"frogs\"");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("app:\"test1 b\" test2:\"frogs\"" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 
     app.parse(R"raw(/string:"app:'test1 b' test2:'frogs'" /t 'qstring 2' /mstr:`"quoted string"`)raw");
-    EXPECT_EQ(str, "app:'test1 b' test2:'frogs'");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("app:'test1 b' test2:'frogs'" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringQuotedMultipleMixedStyle) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedMultipleMixedStyle", "[app]") {
     std::string str, str2, str3;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("-m,--mstr", str3);
     app.allow_windows_style_options();
     app.parse(R"raw(/string:"this is my quoted string" /t 'qstring 2' -m=`"quoted string"`)raw");
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("this is my quoted string" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringQuotedMultipleInMiddle) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedMultipleInMiddle", "[app]") {
     std::string str, str2, str3;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("-m,--mstr", str3);
     app.parse(R"raw(--string="this is my quoted string" -t "qst\"ring 2" -m=`"quoted string"`)raw");
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qst\"ring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("this is my quoted string" == str);
+    CHECK("qst\"ring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringQuotedEscapedCharacters) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedEscapedCharacters", "[app]") {
     std::string str, str2, str3;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("-m,--mstr", str3);
     app.parse(R"raw(--string="this is my \"quoted\" string" -t 'qst\'ring 2' -m=`"quoted\` string"`")raw");
-    EXPECT_EQ(str, "this is my \"quoted\" string");
-    EXPECT_EQ(str2, "qst\'ring 2");
-    EXPECT_EQ(str3, "\"quoted` string\"");
+    CHECK("this is my \"quoted\" string" == str);
+    CHECK("qst\'ring 2" == str2);
+    CHECK("\"quoted` string\"" == str3);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringQuotedMultipleWithEqual) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedMultipleWithEqual", "[app]") {
     std::string str, str2, str3, str4;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
     app.add_option("-m,--mstr", str3);
     app.add_option("-j,--jstr", str4);
     app.parse(R"raw(--string="this is my quoted string" -t 'qstring 2' -m=`"quoted string"` --jstr=Unquoted)raw");
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
-    EXPECT_EQ(str4, "Unquoted");
+    CHECK("this is my quoted string" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
+    CHECK("Unquoted" == str4);
 }
 
-TEST_F(TApp, OneStringEqualVersionSingleStringQuotedMultipleWithEqualAndProgram) {
+TEST_CASE_METHOD(TApp, "OneStringEqualVersionSingleStringQuotedMultipleWithEqualAndProgram", "[app]") {
     std::string str, str2, str3, str4;
     app.add_option("-s,--string", str);
     app.add_option("-t,--tstr", str2);
@@ -410,78 +408,78 @@ TEST_F(TApp, OneStringEqualVersionSingleStringQuotedMultipleWithEqualAndProgram)
     app.parse(
         R"raw(program --string="this is my quoted string" -t 'qstring 2' -m=`"quoted string"` --jstr=Unquoted)raw",
         true);
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
-    EXPECT_EQ(str4, "Unquoted");
+    CHECK("this is my quoted string" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
+    CHECK("Unquoted" == str4);
 }
 
-TEST_F(TApp, OneStringFlagLike) {
+TEST_CASE_METHOD(TApp, "OneStringFlagLike", "[app]") {
     std::string str{"something"};
     app.add_option("-s,--string", str)->expected(0, 1);
     args = {"--string"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_TRUE(str.empty());
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK(str.empty());
 }
 
-TEST_F(TApp, OneIntFlagLike) {
+TEST_CASE_METHOD(TApp, "OneIntFlagLike", "[app]") {
     int val{0};
     auto opt = app.add_option("-i", val)->expected(0, 1);
     args = {"-i"};
     run();
-    EXPECT_EQ(1u, app.count("-i"));
+    CHECK(app.count("-i") == 1u);
     opt->default_str("7");
     run();
-    EXPECT_EQ(val, 7);
+    CHECK(7 == val);
 
     opt->default_val(9);
     run();
-    EXPECT_EQ(val, 9);
+    CHECK(9 == val);
 }
 
-TEST_F(TApp, TogetherInt) {
+TEST_CASE_METHOD(TApp, "TogetherInt", "[app]") {
     int i{0};
     app.add_option("-i,--int", i);
     args = {"-i4"};
     run();
-    EXPECT_EQ(1u, app.count("--int"));
-    EXPECT_EQ(1u, app.count("-i"));
-    EXPECT_EQ(i, 4);
-    EXPECT_EQ(app["-i"]->as<std::string>(), "4");
-    EXPECT_EQ(app["--int"]->as<double>(), 4.0);
+    CHECK(app.count("--int") == 1u);
+    CHECK(app.count("-i") == 1u);
+    CHECK(4 == i);
+    CHECK("4" == app["-i"]->as<std::string>());
+    CHECK(4.0 == app["--int"]->as<double>());
 }
 
-TEST_F(TApp, SepInt) {
+TEST_CASE_METHOD(TApp, "SepInt", "[app]") {
     int i{0};
     app.add_option("-i,--int", i);
     args = {"-i", "4"};
     run();
-    EXPECT_EQ(1u, app.count("--int"));
-    EXPECT_EQ(1u, app.count("-i"));
-    EXPECT_EQ(i, 4);
+    CHECK(app.count("--int") == 1u);
+    CHECK(app.count("-i") == 1u);
+    CHECK(4 == i);
 }
 
-TEST_F(TApp, DefaultStringAgain) {
+TEST_CASE_METHOD(TApp, "DefaultStringAgain", "[app]") {
     std::string str = "previous";
     app.add_option("-s,--string", str);
     run();
-    EXPECT_EQ(0u, app.count("-s"));
-    EXPECT_EQ(0u, app.count("--string"));
-    EXPECT_EQ(str, "previous");
+    CHECK(app.count("-s") == 0u);
+    CHECK(app.count("--string") == 0u);
+    CHECK("previous" == str);
 }
 
-TEST_F(TApp, DefaultStringAgainEmpty) {
+TEST_CASE_METHOD(TApp, "DefaultStringAgainEmpty", "[app]") {
     std::string str = "previous";
     app.add_option("-s,--string", str);
     app.parse("   ");
-    EXPECT_EQ(0u, app.count("-s"));
-    EXPECT_EQ(0u, app.count("--string"));
-    EXPECT_EQ(str, "previous");
+    CHECK(app.count("-s") == 0u);
+    CHECK(app.count("--string") == 0u);
+    CHECK("previous" == str);
 }
 
-TEST_F(TApp, DualOptions) {
+TEST_CASE_METHOD(TApp, "DualOptions", "[app]") {
 
     std::string str = "previous";
     std::vector<std::string> vstr = {"previous"};
@@ -491,13 +489,13 @@ TEST_F(TApp, DualOptions) {
 
     args = {"--vector=one", "--vector=two"};
     run();
-    EXPECT_EQ(ans, vstr);
+    CHECK(vstr == ans);
 
     args = {"--string=one", "--string=two"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, LotsOfFlags) {
+TEST_CASE_METHOD(TApp, "LotsOfFlags", "[app]") {
 
     app.add_flag("-a");
     app.add_flag("-A");
@@ -505,66 +503,66 @@ TEST_F(TApp, LotsOfFlags) {
 
     args = {"-a", "-b", "-aA"};
     run();
-    EXPECT_EQ(2u, app.count("-a"));
-    EXPECT_EQ(1u, app.count("-b"));
-    EXPECT_EQ(1u, app.count("-A"));
-    EXPECT_EQ(app.count_all(), 4u);
+    CHECK(app.count("-a") == 2u);
+    CHECK(app.count("-b") == 1u);
+    CHECK(app.count("-A") == 1u);
+    CHECK(4u == app.count_all());
 }
 
-TEST_F(TApp, NumberFlags) {
+TEST_CASE_METHOD(TApp, "NumberFlags", "[app]") {
 
     int val{0};
     app.add_flag("-1{1},-2{2},-3{3},-4{4},-5{5},-6{6}, -7{7}, -8{8}, -9{9}", val);
 
     args = {"-7"};
     run();
-    EXPECT_EQ(1u, app.count("-1"));
-    EXPECT_EQ(val, 7);
+    CHECK(app.count("-1") == 1u);
+    CHECK(7 == val);
 }
 
-TEST_F(TApp, DisableFlagOverrideTest) {
+TEST_CASE_METHOD(TApp, "DisableFlagOverrideTest", "[app]") {
 
     int val{0};
     auto opt = app.add_flag("--1{1},--2{2},--3{3},--4{4},--5{5},--6{6}, --7{7}, --8{8}, --9{9}", val);
-    EXPECT_FALSE(opt->get_disable_flag_override());
+    CHECK(!opt->get_disable_flag_override());
     opt->disable_flag_override();
     args = {"--7=5"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
-    EXPECT_TRUE(opt->get_disable_flag_override());
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
+    CHECK(opt->get_disable_flag_override());
     opt->disable_flag_override(false);
-    EXPECT_FALSE(opt->get_disable_flag_override());
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(val, 5);
+    CHECK(!opt->get_disable_flag_override());
+    CHECK_NOTHROW(run());
+    CHECK(5 == val);
     opt->disable_flag_override();
     args = {"--7=7"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(TApp, LotsOfFlagsSingleString) {
+TEST_CASE_METHOD(TApp, "LotsOfFlagsSingleString", "[app]") {
 
     app.add_flag("-a");
     app.add_flag("-A");
     app.add_flag("-b");
 
     app.parse("-a -b -aA");
-    EXPECT_EQ(2u, app.count("-a"));
-    EXPECT_EQ(1u, app.count("-b"));
-    EXPECT_EQ(1u, app.count("-A"));
+    CHECK(app.count("-a") == 2u);
+    CHECK(app.count("-b") == 1u);
+    CHECK(app.count("-A") == 1u);
 }
 
-TEST_F(TApp, LotsOfFlagsSingleStringExtraSpace) {
+TEST_CASE_METHOD(TApp, "LotsOfFlagsSingleStringExtraSpace", "[app]") {
 
     app.add_flag("-a");
     app.add_flag("-A");
     app.add_flag("-b");
 
     app.parse("  -a    -b    -aA   ");
-    EXPECT_EQ(2u, app.count("-a"));
-    EXPECT_EQ(1u, app.count("-b"));
-    EXPECT_EQ(1u, app.count("-A"));
+    CHECK(app.count("-a") == 2u);
+    CHECK(app.count("-b") == 1u);
+    CHECK(app.count("-A") == 1u);
 }
 
-TEST_F(TApp, SingleArgVector) {
+TEST_CASE_METHOD(TApp, "SingleArgVector", "[app]") {
 
     std::vector<std::string> channels;
     std::vector<std::string> iargs;
@@ -574,65 +572,65 @@ TEST_F(TApp, SingleArgVector) {
     app.add_option("-p", path);
 
     app.parse("-c t1 -c t2 -c t3 a1 a2 a3 a4 -p happy");
-    EXPECT_EQ(3u, channels.size());
-    EXPECT_EQ(4u, iargs.size());
-    EXPECT_EQ(path, "happy");
+    CHECK(channels.size() == 3u);
+    CHECK(iargs.size() == 4u);
+    CHECK("happy" == path);
 
     app.parse("-c t1 a1 -c t2 -c t3 a2 a3 a4 -p happy");
-    EXPECT_EQ(3u, channels.size());
-    EXPECT_EQ(4u, iargs.size());
-    EXPECT_EQ(path, "happy");
+    CHECK(channels.size() == 3u);
+    CHECK(iargs.size() == 4u);
+    CHECK("happy" == path);
 }
 
-TEST_F(TApp, FlagLikeOption) {
+TEST_CASE_METHOD(TApp, "FlagLikeOption", "[app]") {
     bool val{false};
     auto opt = app.add_option("--flag", val)->type_size(0)->default_str("true");
     args = {"--flag"};
     run();
-    EXPECT_EQ(1u, app.count("--flag"));
-    EXPECT_TRUE(val);
+    CHECK(app.count("--flag") == 1u);
+    CHECK(val);
     val = false;
     opt->type_size(0, 0);  // should be the same as above
-    EXPECT_EQ(opt->get_type_size_min(), 0);
-    EXPECT_EQ(opt->get_type_size_max(), 0);
+    CHECK(0 == opt->get_type_size_min());
+    CHECK(0 == opt->get_type_size_max());
     run();
-    EXPECT_EQ(1u, app.count("--flag"));
-    EXPECT_TRUE(val);
+    CHECK(app.count("--flag") == 1u);
+    CHECK(val);
 }
 
-TEST_F(TApp, FlagLikeIntOption) {
+TEST_CASE_METHOD(TApp, "FlagLikeIntOption", "[app]") {
     int val{-47};
     auto opt = app.add_option("--flag", val)->expected(0, 1);
     // normally some default value should be set, but this test is for some paths in the validators checks to skip
     // validation on empty string if nothing is expected
     opt->check(CLI::PositiveNumber);
     args = {"--flag"};
-    EXPECT_TRUE(opt->as<std::string>().empty());
+    CHECK(opt->as<std::string>().empty());
     run();
-    EXPECT_EQ(1u, app.count("--flag"));
-    EXPECT_NE(val, -47);
+    CHECK(app.count("--flag") == 1u);
+    CHECK(-47 != val);
     args = {"--flag", "12"};
     run();
 
-    EXPECT_EQ(val, 12);
+    CHECK(12 == val);
     args.clear();
     run();
-    EXPECT_TRUE(opt->as<std::string>().empty());
+    CHECK(opt->as<std::string>().empty());
 }
 
-TEST_F(TApp, BoolOnlyFlag) {
+TEST_CASE_METHOD(TApp, "BoolOnlyFlag", "[app]") {
     bool bflag{false};
     app.add_flag("-b", bflag)->multi_option_policy(CLI::MultiOptionPolicy::Throw);
 
     args = {"-b"};
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(bflag);
+    REQUIRE_NOTHROW(run());
+    CHECK(bflag);
 
     args = {"-b", "-b"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, ShortOpts) {
+TEST_CASE_METHOD(TApp, "ShortOpts", "[app]") {
 
     std::uint64_t funnyint{0};
     std::string someopt;
@@ -645,14 +643,14 @@ TEST_F(TApp, ShortOpts) {
 
     run();
 
-    EXPECT_EQ(2u, app.count("-z"));
-    EXPECT_EQ(1u, app.count("-y"));
-    EXPECT_EQ(std::uint64_t{2}, funnyint);
-    EXPECT_EQ("zyz", someopt);
-    EXPECT_EQ(app.count_all(), 3u);
+    CHECK(app.count("-z") == 2u);
+    CHECK(app.count("-y") == 1u);
+    CHECK(funnyint == std::uint64_t{2});
+    CHECK(someopt == "zyz");
+    CHECK(3u == app.count_all());
 }
 
-TEST_F(TApp, TwoParamTemplateOpts) {
+TEST_CASE_METHOD(TApp, "TwoParamTemplateOpts", "[app]") {
 
     double funnyint{0.0};
     auto opt = app.add_option<double, unsigned int>("-y", funnyint);
@@ -661,19 +659,19 @@ TEST_F(TApp, TwoParamTemplateOpts) {
 
     run();
 
-    EXPECT_EQ(32.0, funnyint);
+    CHECK(funnyint == 32.0);
 
     args = {"-y", "32.3"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
     args = {"-y", "-19"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
     opt->capture_default_str();
-    EXPECT_TRUE(opt->get_default_str().empty());
+    CHECK(opt->get_default_str().empty());
 }
 
-TEST_F(TApp, DefaultOpts) {
+TEST_CASE_METHOD(TApp, "DefaultOpts", "[app]") {
 
     int i{3};
     std::string s = "HI";
@@ -685,13 +683,13 @@ TEST_F(TApp, DefaultOpts) {
 
     run();
 
-    EXPECT_EQ(1u, app.count("i"));
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(2, i);
-    EXPECT_EQ("9", s);
+    CHECK(app.count("i") == 1u);
+    CHECK(app.count("-s") == 1u);
+    CHECK(i == 2);
+    CHECK(s == "9");
 }
 
-TEST_F(TApp, TakeLastOpt) {
+TEST_CASE_METHOD(TApp, "TakeLastOpt", "[app]") {
 
     std::string str;
     app.add_option("--str", str)->multi_option_policy(CLI::MultiOptionPolicy::TakeLast);
@@ -700,10 +698,10 @@ TEST_F(TApp, TakeLastOpt) {
 
     run();
 
-    EXPECT_EQ(str, "two");
+    CHECK("two" == str);
 }
 
-TEST_F(TApp, TakeLastOpt2) {
+TEST_CASE_METHOD(TApp, "TakeLastOpt2", "[app]") {
 
     std::string str;
     app.add_option("--str", str)->take_last();
@@ -712,10 +710,10 @@ TEST_F(TApp, TakeLastOpt2) {
 
     run();
 
-    EXPECT_EQ(str, "two");
+    CHECK("two" == str);
 }
 
-TEST_F(TApp, TakeFirstOpt) {
+TEST_CASE_METHOD(TApp, "TakeFirstOpt", "[app]") {
 
     std::string str;
     app.add_option("--str", str)->multi_option_policy(CLI::MultiOptionPolicy::TakeFirst);
@@ -724,10 +722,10 @@ TEST_F(TApp, TakeFirstOpt) {
 
     run();
 
-    EXPECT_EQ(str, "one");
+    CHECK("one" == str);
 }
 
-TEST_F(TApp, TakeFirstOpt2) {
+TEST_CASE_METHOD(TApp, "TakeFirstOpt2", "[app]") {
 
     std::string str;
     app.add_option("--str", str)->take_first();
@@ -736,10 +734,10 @@ TEST_F(TApp, TakeFirstOpt2) {
 
     run();
 
-    EXPECT_EQ(str, "one");
+    CHECK("one" == str);
 }
 
-TEST_F(TApp, JoinOpt) {
+TEST_CASE_METHOD(TApp, "JoinOpt", "[app]") {
 
     std::string str;
     app.add_option("--str", str)->multi_option_policy(CLI::MultiOptionPolicy::Join);
@@ -748,10 +746,10 @@ TEST_F(TApp, JoinOpt) {
 
     run();
 
-    EXPECT_EQ(str, "one\ntwo");
+    CHECK("one\ntwo" == str);
 }
 
-TEST_F(TApp, JoinOpt2) {
+TEST_CASE_METHOD(TApp, "JoinOpt2", "[app]") {
 
     std::string str;
     app.add_option("--str", str)->join();
@@ -760,10 +758,10 @@ TEST_F(TApp, JoinOpt2) {
 
     run();
 
-    EXPECT_EQ(str, "one\ntwo");
+    CHECK("one\ntwo" == str);
 }
 
-TEST_F(TApp, TakeLastOptMulti) {
+TEST_CASE_METHOD(TApp, "TakeLastOptMulti", "[app]") {
     std::vector<int> vals;
     app.add_option("--long", vals)->expected(2)->take_last();
 
@@ -771,10 +769,10 @@ TEST_F(TApp, TakeLastOptMulti) {
 
     run();
 
-    EXPECT_EQ(vals, std::vector<int>({2, 3}));
+    CHECK(std::vector<int>({2, 3}) == vals);
 }
 
-TEST_F(TApp, TakeLastOptMulti_alternative_path) {
+TEST_CASE_METHOD(TApp, "TakeLastOptMulti_alternative_path", "[app]") {
     std::vector<int> vals;
     app.add_option("--long", vals)->expected(2, -1)->take_last();
 
@@ -782,10 +780,10 @@ TEST_F(TApp, TakeLastOptMulti_alternative_path) {
 
     run();
 
-    EXPECT_EQ(vals, std::vector<int>({2, 3}));
+    CHECK(std::vector<int>({2, 3}) == vals);
 }
 
-TEST_F(TApp, TakeLastOptMultiCheck) {
+TEST_CASE_METHOD(TApp, "TakeLastOptMultiCheck", "[app]") {
     std::vector<int> vals;
     auto opt = app.add_option("--long", vals)->expected(-2)->take_last();
 
@@ -793,12 +791,12 @@ TEST_F(TApp, TakeLastOptMultiCheck) {
     opt->check((!CLI::PositiveNumber).application_index(1));
     args = {"--long", "-1", "2", "-3"};
 
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
-    EXPECT_EQ(vals, std::vector<int>({2, -3}));
+    CHECK(std::vector<int>({2, -3}) == vals);
 }
 
-TEST_F(TApp, TakeFirstOptMulti) {
+TEST_CASE_METHOD(TApp, "TakeFirstOptMulti", "[app]") {
     std::vector<int> vals;
     app.add_option("--long", vals)->expected(2)->take_first();
 
@@ -806,10 +804,10 @@ TEST_F(TApp, TakeFirstOptMulti) {
 
     run();
 
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 }
 
-TEST_F(TApp, ComplexOptMulti) {
+TEST_CASE_METHOD(TApp, "ComplexOptMulti", "[app]") {
     std::complex<double> val;
     app.add_complex("--long", val)->take_first()->allow_extra_args();
 
@@ -817,35 +815,35 @@ TEST_F(TApp, ComplexOptMulti) {
 
     run();
 
-    EXPECT_DOUBLE_EQ(val.real(), 1);
-    EXPECT_DOUBLE_EQ(val.imag(), 2);
+    CHECK(1 == Approx(val.real()));
+    CHECK(2 == Approx(val.imag()));
 }
 
-TEST_F(TApp, MissingValueNonRequiredOpt) {
+TEST_CASE_METHOD(TApp, "MissingValueNonRequiredOpt", "[app]") {
     int count{0};
     app.add_option("-c,--count", count);
 
     args = {"-c"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"--count"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, MissingValueMoreThan) {
+TEST_CASE_METHOD(TApp, "MissingValueMoreThan", "[app]") {
     std::vector<int> vals1;
     std::vector<int> vals2;
     app.add_option("-v", vals1)->expected(-2);
     app.add_option("--vals", vals2)->expected(-2);
 
     args = {"-v", "2"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"--vals", "4"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, NoMissingValueMoreThan) {
+TEST_CASE_METHOD(TApp, "NoMissingValueMoreThan", "[app]") {
     std::vector<int> vals1;
     std::vector<int> vals2;
     app.add_option("-v", vals1)->expected(-2);
@@ -853,104 +851,104 @@ TEST_F(TApp, NoMissingValueMoreThan) {
 
     args = {"-v", "2", "3", "4"};
     run();
-    EXPECT_EQ(vals1, std::vector<int>({2, 3, 4}));
+    CHECK(std::vector<int>({2, 3, 4}) == vals1);
 
     args = {"--vals", "2", "3", "4"};
     run();
-    EXPECT_EQ(vals2, std::vector<int>({2, 3, 4}));
+    CHECK(std::vector<int>({2, 3, 4}) == vals2);
 }
 
-TEST_F(TApp, NotRequiredOptsSingle) {
+TEST_CASE_METHOD(TApp, "NotRequiredOptsSingle", "[app]") {
 
     std::string str;
     app.add_option("--str", str);
 
     args = {"--str"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, NotRequiredOptsSingleShort) {
+TEST_CASE_METHOD(TApp, "NotRequiredOptsSingleShort", "[app]") {
 
     std::string str;
     app.add_option("-s", str);
 
     args = {"-s"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, RequiredOptsSingle) {
+TEST_CASE_METHOD(TApp, "RequiredOptsSingle", "[app]") {
 
     std::string str;
     app.add_option("--str", str)->required();
 
     args = {"--str"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, RequiredOptsSingleShort) {
+TEST_CASE_METHOD(TApp, "RequiredOptsSingleShort", "[app]") {
 
     std::string str;
     app.add_option("-s", str)->required();
 
     args = {"-s"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, RequiredOptsDouble) {
+TEST_CASE_METHOD(TApp, "RequiredOptsDouble", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("--str", strs)->required()->expected(2);
 
     args = {"--str", "one"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"--str", "one", "two"};
 
     run();
 
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
 }
 
-TEST_F(TApp, RequiredOptsDoubleShort) {
+TEST_CASE_METHOD(TApp, "RequiredOptsDoubleShort", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("-s", strs)->required()->expected(2);
 
     args = {"-s", "one"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"-s", "one", "-s", "one", "-s", "one"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, RequiredOptsDoubleNeg) {
+TEST_CASE_METHOD(TApp, "RequiredOptsDoubleNeg", "[app]") {
     std::vector<std::string> strs;
     app.add_option("-s", strs)->required()->expected(-2);
 
     args = {"-s", "one"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"-s", "one", "two", "-s", "three"};
 
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two", "three"}));
+    REQUIRE_NOTHROW(run());
+    CHECK(std::vector<std::string>({"one", "two", "three"}) == strs);
 
     args = {"-s", "one", "two"};
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
+    REQUIRE_NOTHROW(run());
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
 }
 
 // This makes sure unlimited option priority is
 // correct for space vs. no space #90
-TEST_F(TApp, PositionalNoSpace) {
+TEST_CASE_METHOD(TApp, "PositionalNoSpace", "[app]") {
     std::vector<std::string> options;
     std::string foo, bar;
 
@@ -961,37 +959,37 @@ TEST_F(TApp, PositionalNoSpace) {
     args = {"-O", "Test", "param1", "param2"};
     run();
 
-    EXPECT_EQ(options.size(), 1u);
-    EXPECT_EQ(options.at(0), "Test");
+    CHECK(1u == options.size());
+    CHECK("Test" == options.at(0));
 
     args = {"-OTest", "param1", "param2"};
     run();
 
-    EXPECT_EQ(options.size(), 1u);
-    EXPECT_EQ(options.at(0), "Test");
+    CHECK(1u == options.size());
+    CHECK("Test" == options.at(0));
 }
 
 // Tests positionals at end
-TEST_F(TApp, PositionalAtEnd) {
+TEST_CASE_METHOD(TApp, "PositionalAtEnd", "[app]") {
     std::string options;
     std::string foo;
 
     app.add_option("-O", options);
     app.add_option("foo", foo);
     app.positionals_at_end();
-    EXPECT_TRUE(app.get_positionals_at_end());
+    CHECK(app.get_positionals_at_end());
     args = {"-O", "Test", "param1"};
     run();
 
-    EXPECT_EQ(options, "Test");
-    EXPECT_EQ(foo, "param1");
+    CHECK("Test" == options);
+    CHECK("param1" == foo);
 
     args = {"param2", "-O", "Test"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
 // Tests positionals at end
-TEST_F(TApp, RequiredPositionals) {
+TEST_CASE_METHOD(TApp, "RequiredPositionals", "[app]") {
     std::vector<std::string> sources;
     std::string dest;
     app.add_option("src", sources);
@@ -1001,18 +999,18 @@ TEST_F(TApp, RequiredPositionals) {
     args = {"1", "2", "3"};
     run();
 
-    EXPECT_EQ(sources.size(), 2u);
-    EXPECT_EQ(dest, "3");
+    CHECK(2u == sources.size());
+    CHECK("3" == dest);
 
     args = {"a"};
     sources.clear();
     run();
 
-    EXPECT_EQ(sources.size(), 0u);
-    EXPECT_EQ(dest, "a");
+    CHECK(0u == sources.size());
+    CHECK("a" == dest);
 }
 
-TEST_F(TApp, RequiredPositionalVector) {
+TEST_CASE_METHOD(TApp, "RequiredPositionalVector", "[app]") {
     std::string d1;
     std::string d2;
     std::string d3;
@@ -1028,19 +1026,19 @@ TEST_F(TApp, RequiredPositionalVector) {
     args = {"1", "2", "3"};
     run();
 
-    EXPECT_EQ(sources.size(), 1u);
-    EXPECT_EQ(d1, "1");
-    EXPECT_EQ(d2, "2");
-    EXPECT_TRUE(d3.empty());
+    CHECK(1u == sources.size());
+    CHECK("1" == d1);
+    CHECK("2" == d2);
+    CHECK(d3.empty());
     args = {"a"};
     sources.clear();
     run();
 
-    EXPECT_EQ(sources.size(), 1u);
+    CHECK(1u == sources.size());
 }
 
 // Tests positionals at end
-TEST_F(TApp, RequiredPositionalValidation) {
+TEST_CASE_METHOD(TApp, "RequiredPositionalValidation", "[app]") {
     std::vector<std::string> sources;
     int dest;  // required
     std::string d2;
@@ -1052,13 +1050,13 @@ TEST_F(TApp, RequiredPositionalValidation) {
     args = {"1", "2", "string", "3"};
     run();
 
-    EXPECT_EQ(sources.size(), 2u);
-    EXPECT_EQ(dest, 3);
-    EXPECT_EQ(d2, "string");
+    CHECK(2u == sources.size());
+    CHECK(3 == dest);
+    CHECK("string" == d2);
 }
 
 // Tests positionals at end
-TEST_F(TApp, PositionalValidation) {
+TEST_CASE_METHOD(TApp, "PositionalValidation", "[app]") {
     std::string options;
     std::string foo;
 
@@ -1069,19 +1067,19 @@ TEST_F(TApp, PositionalValidation) {
     args = {"1", "param1"};
     run();
 
-    EXPECT_EQ(options, "1");
-    EXPECT_EQ(foo, "param1");
+    CHECK("1" == options);
+    CHECK("param1" == foo);
 
     args = {"param1", "1"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
-    EXPECT_EQ(options, "1");
-    EXPECT_EQ(foo, "param1");
+    CHECK("1" == options);
+    CHECK("param1" == foo);
 
-    EXPECT_NE(app.get_option("bar")->get_validator("valbar"), nullptr);
+    CHECK(nullptr != app.get_option("bar")->get_validator("valbar"));
 }
 
-TEST_F(TApp, PositionalNoSpaceLong) {
+TEST_CASE_METHOD(TApp, "PositionalNoSpaceLong", "[app]") {
     std::vector<std::string> options;
     std::string foo, bar;
 
@@ -1092,107 +1090,107 @@ TEST_F(TApp, PositionalNoSpaceLong) {
     args = {"--option", "Test", "param1", "param2"};
     run();
 
-    EXPECT_EQ(options.size(), 1u);
-    EXPECT_EQ(options.at(0), "Test");
+    CHECK(1u == options.size());
+    CHECK("Test" == options.at(0));
 
     args = {"--option=Test", "param1", "param2"};
     run();
 
-    EXPECT_EQ(options.size(), 1u);
-    EXPECT_EQ(options.at(0), "Test");
+    CHECK(1u == options.size());
+    CHECK("Test" == options.at(0));
 }
 
-TEST_F(TApp, RequiredOptsUnlimited) {
+TEST_CASE_METHOD(TApp, "RequiredOptsUnlimited", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("--str", strs)->required();
 
     args = {"--str"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"--str", "one", "--str", "two"};
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
 
     args = {"--str", "one", "two"};
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
 
     // It's better to feed a hungry option than to feed allow_extras
     app.allow_extras();
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({}));
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
+    CHECK(std::vector<std::string>({}) == app.remaining());
 
     app.allow_extras(false);
     std::vector<std::string> remain;
     auto popt = app.add_option("positional", remain);
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
-    EXPECT_EQ(remain, std::vector<std::string>());
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
+    CHECK(std::vector<std::string>() == remain);
 
     args = {"--str", "one", "--", "two"};
 
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"two"}));
+    CHECK(std::vector<std::string>({"one"}) == strs);
+    CHECK(std::vector<std::string>({"two"}) == remain);
 
     args = {"one", "--str", "two"};
 
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"two"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"one"}));
+    CHECK(std::vector<std::string>({"two"}) == strs);
+    CHECK(std::vector<std::string>({"one"}) == remain);
 
     args = {"--str", "one", "two"};
     popt->required();
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"two"}));
+    CHECK(std::vector<std::string>({"one"}) == strs);
+    CHECK(std::vector<std::string>({"two"}) == remain);
 }
 
-TEST_F(TApp, RequiredOptsUnlimitedShort) {
+TEST_CASE_METHOD(TApp, "RequiredOptsUnlimitedShort", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("-s", strs)->required();
 
     args = {"-s"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"-s", "one", "-s", "two"};
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
 
     args = {"-s", "one", "two"};
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
 
     // It's better to feed a hungry option than to feed allow_extras
     app.allow_extras();
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({}));
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
+    CHECK(std::vector<std::string>({}) == app.remaining());
 
     app.allow_extras(false);
     std::vector<std::string> remain;
     app.add_option("positional", remain);
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "two"}));
-    EXPECT_EQ(remain, std::vector<std::string>());
+    CHECK(std::vector<std::string>({"one", "two"}) == strs);
+    CHECK(std::vector<std::string>() == remain);
 
     args = {"-s", "one", "--", "two"};
 
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"one"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"two"}));
+    CHECK(std::vector<std::string>({"one"}) == strs);
+    CHECK(std::vector<std::string>({"two"}) == remain);
 
     args = {"one", "-s", "two"};
 
     run();
-    EXPECT_EQ(strs, std::vector<std::string>({"two"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"one"}));
+    CHECK(std::vector<std::string>({"two"}) == strs);
+    CHECK(std::vector<std::string>({"one"}) == remain);
 }
 
-TEST_F(TApp, OptsUnlimitedEnd) {
+TEST_CASE_METHOD(TApp, "OptsUnlimitedEnd", "[app]") {
     std::vector<std::string> strs;
     app.add_option("-s,--str", strs);
     app.allow_extras();
@@ -1201,11 +1199,11 @@ TEST_F(TApp, OptsUnlimitedEnd) {
 
     run();
 
-    EXPECT_EQ(strs, std::vector<std::string>({"two", "three"}));
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"one", "four"}));
+    CHECK(std::vector<std::string>({"two", "three"}) == strs);
+    CHECK(std::vector<std::string>({"one", "four"}) == app.remaining());
 }
 
-TEST_F(TApp, RequireOptPriority) {
+TEST_CASE_METHOD(TApp, "RequireOptPriority", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("--str", strs);
@@ -1215,17 +1213,17 @@ TEST_F(TApp, RequireOptPriority) {
     args = {"--str", "one", "two", "three"};
     run();
 
-    EXPECT_EQ(strs, std::vector<std::string>({"one"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"two", "three"}));
+    CHECK(std::vector<std::string>({"one"}) == strs);
+    CHECK(std::vector<std::string>({"two", "three"}) == remain);
 
     args = {"two", "three", "--str", "one", "four"};
     run();
 
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "four"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"two", "three"}));
+    CHECK(std::vector<std::string>({"one", "four"}) == strs);
+    CHECK(std::vector<std::string>({"two", "three"}) == remain);
 }
 
-TEST_F(TApp, RequireOptPriorityShort) {
+TEST_CASE_METHOD(TApp, "RequireOptPriorityShort", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("-s", strs)->required();
@@ -1235,53 +1233,53 @@ TEST_F(TApp, RequireOptPriorityShort) {
     args = {"-s", "one", "two", "three"};
     run();
 
-    EXPECT_EQ(strs, std::vector<std::string>({"one"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"two", "three"}));
+    CHECK(std::vector<std::string>({"one"}) == strs);
+    CHECK(std::vector<std::string>({"two", "three"}) == remain);
 
     args = {"two", "three", "-s", "one", "four"};
     run();
 
-    EXPECT_EQ(strs, std::vector<std::string>({"one", "four"}));
-    EXPECT_EQ(remain, std::vector<std::string>({"two", "three"}));
+    CHECK(std::vector<std::string>({"one", "four"}) == strs);
+    CHECK(std::vector<std::string>({"two", "three"}) == remain);
 }
 
-TEST_F(TApp, NotRequiredExpectedDouble) {
+TEST_CASE_METHOD(TApp, "NotRequiredExpectedDouble", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("--str", strs)->expected(2);
 
     args = {"--str", "one"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, NotRequiredExpectedDoubleShort) {
+TEST_CASE_METHOD(TApp, "NotRequiredExpectedDoubleShort", "[app]") {
 
     std::vector<std::string> strs;
     app.add_option("-s", strs)->expected(2);
 
     args = {"-s", "one"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, RequiredFlags) {
+TEST_CASE_METHOD(TApp, "RequiredFlags", "[app]") {
     app.add_flag("-a")->required();
     app.add_flag("-b")->mandatory();  // Alternate term
 
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"-a"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"-b"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"-a", "-b"};
     run();
 }
 
-TEST_F(TApp, CallbackFlags) {
+TEST_CASE_METHOD(TApp, "CallbackFlags", "[app]") {
 
     std::int64_t value{0};
 
@@ -1290,20 +1288,20 @@ TEST_F(TApp, CallbackFlags) {
     app.add_flag_function("-v", func);
 
     run();
-    EXPECT_EQ(value, 0u);
+    CHECK(0u == value);
 
     args = {"-v"};
     run();
-    EXPECT_EQ(value, 1u);
+    CHECK(1u == value);
 
     args = {"-vv"};
     run();
-    EXPECT_EQ(value, 2u);
+    CHECK(2u == value);
 
-    EXPECT_THROW(app.add_flag_function("hi", func), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_flag_function("hi", func), CLI::IncorrectConstruction);
 }
 
-TEST_F(TApp, CallbackFlagsFalse) {
+TEST_CASE_METHOD(TApp, "CallbackFlagsFalse", "[app]") {
     std::int64_t value = 0;
 
     auto func = [&value](std::int64_t x) { value = x; };
@@ -1311,28 +1309,28 @@ TEST_F(TApp, CallbackFlagsFalse) {
     app.add_flag_function("-v,-f{false},--val,--fval{false}", func);
 
     run();
-    EXPECT_EQ(value, 0);
+    CHECK(0 == value);
 
     args = {"-f"};
     run();
-    EXPECT_EQ(value, -1);
+    CHECK(-1 == value);
 
     args = {"-vfv"};
     run();
-    EXPECT_EQ(value, 1);
+    CHECK(1 == value);
 
     args = {"--fval"};
     run();
-    EXPECT_EQ(value, -1);
+    CHECK(-1 == value);
 
     args = {"--fval=2"};
     run();
-    EXPECT_EQ(value, -2);
+    CHECK(-2 == value);
 
-    EXPECT_THROW(app.add_flag_function("hi", func), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_flag_function("hi", func), CLI::IncorrectConstruction);
 }
 
-TEST_F(TApp, CallbackFlagsFalseShortcut) {
+TEST_CASE_METHOD(TApp, "CallbackFlagsFalseShortcut", "[app]") {
     std::int64_t value = 0;
 
     auto func = [&value](std::int64_t x) { value = x; };
@@ -1340,29 +1338,29 @@ TEST_F(TApp, CallbackFlagsFalseShortcut) {
     app.add_flag_function("-v,!-f,--val,!--fval", func);
 
     run();
-    EXPECT_EQ(value, 0);
+    CHECK(0 == value);
 
     args = {"-f"};
     run();
-    EXPECT_EQ(value, -1);
+    CHECK(-1 == value);
 
     args = {"-vfv"};
     run();
-    EXPECT_EQ(value, 1);
+    CHECK(1 == value);
 
     args = {"--fval"};
     run();
-    EXPECT_EQ(value, -1);
+    CHECK(-1 == value);
 
     args = {"--fval=2"};
     run();
-    EXPECT_EQ(value, -2);
+    CHECK(-2 == value);
 
-    EXPECT_THROW(app.add_flag_function("hi", func), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_flag_function("hi", func), CLI::IncorrectConstruction);
 }
 
 #if __cplusplus >= 201402L || _MSC_VER >= 1900
-TEST_F(TApp, CallbackFlagsAuto) {
+TEST_CASE_METHOD(TApp, "CallbackFlagsAuto", "[app]") {
 
     std::int64_t value{0};
 
@@ -1371,21 +1369,21 @@ TEST_F(TApp, CallbackFlagsAuto) {
     app.add_flag("-v", func);
 
     run();
-    EXPECT_EQ(value, 0u);
+    CHECK(0u == value);
 
     args = {"-v"};
     run();
-    EXPECT_EQ(value, 1u);
+    CHECK(1u == value);
 
     args = {"-vv"};
     run();
-    EXPECT_EQ(value, 2u);
+    CHECK(2u == value);
 
-    EXPECT_THROW(app.add_flag("hi", func), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_flag("hi", func), CLI::IncorrectConstruction);
 }
 #endif
 
-TEST_F(TApp, Positionals) {
+TEST_CASE_METHOD(TApp, "Positionals", "[app]") {
 
     std::string posit1;
     std::string posit2;
@@ -1396,13 +1394,13 @@ TEST_F(TApp, Positionals) {
 
     run();
 
-    EXPECT_EQ(1u, app.count("posit1"));
-    EXPECT_EQ(1u, app.count("posit2"));
-    EXPECT_EQ("thing1", posit1);
-    EXPECT_EQ("thing2", posit2);
+    CHECK(app.count("posit1") == 1u);
+    CHECK(app.count("posit2") == 1u);
+    CHECK(posit1 == "thing1");
+    CHECK(posit2 == "thing2");
 }
 
-TEST_F(TApp, ForcedPositional) {
+TEST_CASE_METHOD(TApp, "ForcedPositional", "[app]") {
     std::vector<std::string> posit;
     auto one = app.add_flag("--one");
     app.add_option("posit", posit);
@@ -1410,18 +1408,18 @@ TEST_F(TApp, ForcedPositional) {
     args = {"--one", "two", "three"};
     run();
     std::vector<std::string> answers1 = {"two", "three"};
-    EXPECT_TRUE(one->count());
-    EXPECT_EQ(answers1, posit);
+    CHECK(one->count());
+    CHECK(posit == answers1);
 
     args = {"--", "--one", "two", "three"};
     std::vector<std::string> answers2 = {"--one", "two", "three"};
     run();
 
-    EXPECT_FALSE(one->count());
-    EXPECT_EQ(answers2, posit);
+    CHECK(!one->count());
+    CHECK(posit == answers2);
 }
 
-TEST_F(TApp, MixedPositionals) {
+TEST_CASE_METHOD(TApp, "MixedPositionals", "[app]") {
 
     int positional_int{0};
     std::string positional_string;
@@ -1432,28 +1430,28 @@ TEST_F(TApp, MixedPositionals) {
 
     run();
 
-    EXPECT_EQ(1u, app.count("posit2"));
-    EXPECT_EQ(1u, app.count("--posit1"));
-    EXPECT_EQ(7, positional_int);
-    EXPECT_EQ("thing2", positional_string);
+    CHECK(app.count("posit2") == 1u);
+    CHECK(app.count("--posit1") == 1u);
+    CHECK(positional_int == 7);
+    CHECK(positional_string == "thing2");
 }
 
-TEST_F(TApp, BigPositional) {
+TEST_CASE_METHOD(TApp, "BigPositional", "[app]") {
     std::vector<std::string> vec;
     app.add_option("pos", vec);
 
     args = {"one"};
 
     run();
-    EXPECT_EQ(args, vec);
+    CHECK(vec == args);
 
     args = {"one", "two"};
     run();
 
-    EXPECT_EQ(args, vec);
+    CHECK(vec == args);
 }
 
-TEST_F(TApp, Reset) {
+TEST_CASE_METHOD(TApp, "Reset", "[app]") {
 
     app.add_flag("--simple");
     double doub{0.0};
@@ -1463,139 +1461,139 @@ TEST_F(TApp, Reset) {
 
     run();
 
-    EXPECT_EQ(1u, app.count("--simple"));
-    EXPECT_EQ(1u, app.count("-d"));
-    EXPECT_DOUBLE_EQ(1.2, doub);
+    CHECK(app.count("--simple") == 1u);
+    CHECK(app.count("-d") == 1u);
+    CHECK(doub == Approx(1.2));
 
     app.clear();
 
-    EXPECT_EQ(0u, app.count("--simple"));
-    EXPECT_EQ(0u, app.count("-d"));
+    CHECK(app.count("--simple") == 0u);
+    CHECK(app.count("-d") == 0u);
 
     run();
 
-    EXPECT_EQ(1u, app.count("--simple"));
-    EXPECT_EQ(1u, app.count("-d"));
-    EXPECT_DOUBLE_EQ(1.2, doub);
+    CHECK(app.count("--simple") == 1u);
+    CHECK(app.count("-d") == 1u);
+    CHECK(doub == Approx(1.2));
 }
 
-TEST_F(TApp, RemoveOption) {
+TEST_CASE_METHOD(TApp, "RemoveOption", "[app]") {
     app.add_flag("--one");
     auto opt = app.add_flag("--two");
 
-    EXPECT_TRUE(app.remove_option(opt));
-    EXPECT_FALSE(app.remove_option(opt));
+    CHECK(app.remove_option(opt));
+    CHECK(!app.remove_option(opt));
 
     args = {"--two"};
 
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, RemoveNeedsLinks) {
+TEST_CASE_METHOD(TApp, "RemoveNeedsLinks", "[app]") {
     auto one = app.add_flag("--one");
     auto two = app.add_flag("--two");
 
     two->needs(one);
     one->needs(two);
 
-    EXPECT_TRUE(app.remove_option(one));
+    CHECK(app.remove_option(one));
 
     args = {"--two"};
 
     run();
 }
 
-TEST_F(TApp, RemoveExcludesLinks) {
+TEST_CASE_METHOD(TApp, "RemoveExcludesLinks", "[app]") {
     auto one = app.add_flag("--one");
     auto two = app.add_flag("--two");
 
     two->excludes(one);
     one->excludes(two);
 
-    EXPECT_TRUE(app.remove_option(one));
+    CHECK(app.remove_option(one));
 
     args = {"--two"};
 
     run();  // Mostly hoping it does not crash
 }
 
-TEST_F(TApp, FileNotExists) {
+TEST_CASE_METHOD(TApp, "FileNotExists", "[app]") {
     std::string myfile{"TestNonFileNotUsed.txt"};
-    ASSERT_NO_THROW(CLI::NonexistentPath(myfile));
+    REQUIRE_NOTHROW(CLI::NonexistentPath(myfile));
 
     std::string filename;
     auto opt = app.add_option("--file", filename)->check(CLI::NonexistentPath, "path_check");
     args = {"--file", myfile};
 
     run();
-    EXPECT_EQ(myfile, filename);
+    CHECK(filename == myfile);
 
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK(ok);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
     // deactivate the check, so it should run now
     opt->get_validator("path_check")->active(false);
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
     std::remove(myfile.c_str());
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK(!CLI::ExistingFile(myfile).empty());
 }
 
-TEST_F(TApp, FileExists) {
+TEST_CASE_METHOD(TApp, "FileExists", "[app]") {
     std::string myfile{"TestNonFileNotUsed.txt"};
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK(!CLI::ExistingFile(myfile).empty());
 
     std::string filename = "Failed";
     app.add_option("--file", filename)->check(CLI::ExistingFile);
     args = {"--file", myfile};
 
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
+    CHECK(ok);
     run();
-    EXPECT_EQ(myfile, filename);
+    CHECK(filename == myfile);
 
     std::remove(myfile.c_str());
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK(!CLI::ExistingFile(myfile).empty());
 }
 
-TEST_F(TApp, NotFileExists) {
+TEST_CASE_METHOD(TApp, "NotFileExists", "[app]") {
     std::string myfile{"TestNonFileNotUsed.txt"};
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK(!CLI::ExistingFile(myfile).empty());
 
     std::string filename = "Failed";
     app.add_option("--file", filename)->check(!CLI::ExistingFile);
     args = {"--file", myfile};
 
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK(ok);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     std::remove(myfile.c_str());
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK(!CLI::ExistingFile(myfile).empty());
 }
 
-TEST_F(TApp, DefaultedResult) {
+TEST_CASE_METHOD(TApp, "DefaultedResult", "[app]") {
     std::string sval = "NA";
     int ival{0};
     auto opts = app.add_option("--string", sval)->capture_default_str();
     auto optv = app.add_option("--val", ival);
     args = {};
     run();
-    EXPECT_EQ(sval, "NA");
+    CHECK("NA" == sval);
     std::string nString;
     opts->results(nString);
-    EXPECT_EQ(nString, "NA");
+    CHECK("NA" == nString);
     int newIval;
-    // EXPECT_THROW(optv->results(newIval), CLI::ConversionError);
+    // CHECK_THROWS_AS (optv->results(newIval), CLI::ConversionError);
     optv->default_str("442");
     optv->results(newIval);
-    EXPECT_EQ(newIval, 442);
+    CHECK(442 == newIval);
 }
 
-TEST_F(TApp, OriginalOrder) {
+TEST_CASE_METHOD(TApp, "OriginalOrder", "[app]") {
     std::vector<int> st1;
     CLI::Option *op1 = app.add_option("-a", st1);
     std::vector<int> st2;
@@ -1605,13 +1603,13 @@ TEST_F(TApp, OriginalOrder) {
 
     run();
 
-    EXPECT_EQ(st1, std::vector<int>({1, 3, 4}));
-    EXPECT_EQ(st2, std::vector<int>({2}));
+    CHECK(std::vector<int>({1, 3, 4}) == st1);
+    CHECK(std::vector<int>({2}) == st2);
 
-    EXPECT_EQ(app.parse_order(), std::vector<CLI::Option *>({op1, op2, op1, op1}));
+    CHECK(std::vector<CLI::Option *>({op1, op2, op1, op1}) == app.parse_order());
 }
 
-TEST_F(TApp, NeedsFlags) {
+TEST_CASE_METHOD(TApp, "NeedsFlags", "[app]") {
     CLI::Option *opt = app.add_flag("-s,--string");
     app.add_flag("--both")->needs(opt);
 
@@ -1624,12 +1622,12 @@ TEST_F(TApp, NeedsFlags) {
     run();
 
     args = {"--both"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
-    EXPECT_NO_THROW(opt->needs(opt));
+    CHECK_NOTHROW(opt->needs(opt));
 }
 
-TEST_F(TApp, ExcludesFlags) {
+TEST_CASE_METHOD(TApp, "ExcludesFlags", "[app]") {
     CLI::Option *opt = app.add_flag("-s,--string");
     app.add_flag("--nostr")->excludes(opt);
 
@@ -1642,15 +1640,15 @@ TEST_F(TApp, ExcludesFlags) {
     run();
 
     args = {"--nostr", "-s"};
-    EXPECT_THROW(run(), CLI::ExcludesError);
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
 
     args = {"--string", "--nostr"};
-    EXPECT_THROW(run(), CLI::ExcludesError);
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
 
-    EXPECT_THROW(opt->excludes(opt), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(opt->excludes(opt), CLI::IncorrectConstruction);
 }
 
-TEST_F(TApp, ExcludesMixedFlags) {
+TEST_CASE_METHOD(TApp, "ExcludesMixedFlags", "[app]") {
     CLI::Option *opt1 = app.add_flag("--opt1");
     app.add_flag("--opt2");
     CLI::Option *opt3 = app.add_flag("--opt3");
@@ -1665,13 +1663,13 @@ TEST_F(TApp, ExcludesMixedFlags) {
     run();
 
     args = {"--no", "--opt1"};
-    EXPECT_THROW(run(), CLI::ExcludesError);
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
 
     args = {"--no", "--opt2"};
-    EXPECT_THROW(run(), CLI::ExcludesError);
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
 }
 
-TEST_F(TApp, NeedsMultiFlags) {
+TEST_CASE_METHOD(TApp, "NeedsMultiFlags", "[app]") {
     CLI::Option *opt1 = app.add_flag("--opt1");
     CLI::Option *opt2 = app.add_flag("--opt2");
     CLI::Option *opt3 = app.add_flag("--opt3");
@@ -1686,19 +1684,19 @@ TEST_F(TApp, NeedsMultiFlags) {
     run();
 
     args = {"--optall"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--optall", "--opt1"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--optall", "--opt2", "--opt1"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--optall", "--opt1", "--opt2", "--opt3"};
     run();
 }
 
-TEST_F(TApp, NeedsMixedFlags) {
+TEST_CASE_METHOD(TApp, "NeedsMixedFlags", "[app]") {
     CLI::Option *opt1 = app.add_flag("--opt1");
     app.add_flag("--opt2");
     app.add_flag("--opt3");
@@ -1713,19 +1711,19 @@ TEST_F(TApp, NeedsMixedFlags) {
     run();
 
     args = {"--optall"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--optall", "--opt1"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--optall", "--opt2", "--opt1"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--optall", "--opt1", "--opt2", "--opt3"};
     run();
 }
 
-TEST_F(TApp, NeedsChainedFlags) {
+TEST_CASE_METHOD(TApp, "NeedsChainedFlags", "[app]") {
     CLI::Option *opt1 = app.add_flag("--opt1");
     CLI::Option *opt2 = app.add_flag("--opt2")->needs(opt1);
     app.add_flag("--opt3")->needs(opt2);
@@ -1736,16 +1734,16 @@ TEST_F(TApp, NeedsChainedFlags) {
     run();
 
     args = {"--opt2"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--opt3"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--opt3", "--opt2"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--opt3", "--opt1"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--opt2", "--opt1"};
     run();
@@ -1754,7 +1752,7 @@ TEST_F(TApp, NeedsChainedFlags) {
     run();
 }
 
-TEST_F(TApp, Env) {
+TEST_CASE_METHOD(TApp, "Env", "[app]") {
 
     put_env("CLI11_TEST_ENV_TMP", "2");
 
@@ -1763,18 +1761,18 @@ TEST_F(TApp, Env) {
 
     run();
 
-    EXPECT_EQ(2, val);
-    EXPECT_EQ(1u, vopt->count());
+    CHECK(val == 2);
+    CHECK(vopt->count() == 1u);
 
     vopt->required();
     run();
 
     unset_env("CLI11_TEST_ENV_TMP");
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
 // curiously check if an environmental only option works
-TEST_F(TApp, EnvOnly) {
+TEST_CASE_METHOD(TApp, "EnvOnly", "[app]") {
 
     put_env("CLI11_TEST_ENV_TMP", "2");
 
@@ -1783,25 +1781,25 @@ TEST_F(TApp, EnvOnly) {
 
     run();
 
-    EXPECT_EQ(2, val);
-    EXPECT_EQ(1u, vopt->count());
+    CHECK(val == 2);
+    CHECK(vopt->count() == 1u);
 
     vopt->required();
     run();
 
     unset_env("CLI11_TEST_ENV_TMP");
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(TApp, RangeInt) {
+TEST_CASE_METHOD(TApp, "RangeInt", "[app]") {
     int x{0};
     app.add_option("--one", x)->check(CLI::Range(3, 6));
 
     args = {"--one=1"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--one=7"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--one=3"};
     run();
@@ -1813,17 +1811,17 @@ TEST_F(TApp, RangeInt) {
     run();
 }
 
-TEST_F(TApp, RangeDouble) {
+TEST_CASE_METHOD(TApp, "RangeDouble", "[app]") {
 
     double x{0.0};
     /// Note that this must be a double in Range, too
     app.add_option("--one", x)->check(CLI::Range(3.0, 6.0));
 
     args = {"--one=1"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--one=7"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--one=3"};
     run();
@@ -1835,26 +1833,26 @@ TEST_F(TApp, RangeDouble) {
     run();
 }
 
-TEST_F(TApp, typeCheck) {
+TEST_CASE_METHOD(TApp, "typeCheck", "[app]") {
 
     /// Note that this must be a double in Range, too
     app.add_option("--one")->check(CLI::TypeValidator<unsigned int>());
 
     args = {"--one=1"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"--one=-7"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--one=error"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--one=4.568"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
 // Check to make sure programmatic access to left over is available
-TEST_F(TApp, AllowExtras) {
+TEST_CASE_METHOD(TApp, "AllowExtras", "[app]") {
 
     app.allow_extras();
 
@@ -1863,32 +1861,32 @@ TEST_F(TApp, AllowExtras) {
 
     args = {"-x", "-f"};
 
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(val);
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"-x"}));
+    REQUIRE_NOTHROW(run());
+    CHECK(val);
+    CHECK(std::vector<std::string>({"-x"}) == app.remaining());
 }
 
-TEST_F(TApp, AllowExtrasOrder) {
+TEST_CASE_METHOD(TApp, "AllowExtrasOrder", "[app]") {
 
     app.allow_extras();
 
     args = {"-x", "-f"};
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"-x", "-f"}));
+    REQUIRE_NOTHROW(run());
+    CHECK(std::vector<std::string>({"-x", "-f"}) == app.remaining());
 
     std::vector<std::string> left_over = app.remaining();
     app.parse(left_over);
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"-f", "-x"}));
-    EXPECT_EQ(app.remaining_for_passthrough(), left_over);
+    CHECK(std::vector<std::string>({"-f", "-x"}) == app.remaining());
+    CHECK(left_over == app.remaining_for_passthrough());
 }
 
-TEST_F(TApp, AllowExtrasCascade) {
+TEST_CASE_METHOD(TApp, "AllowExtrasCascade", "[app]") {
 
     app.allow_extras();
 
     args = {"-x", "45", "-f", "27"};
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"-x", "45", "-f", "27"}));
+    REQUIRE_NOTHROW(run());
+    CHECK(std::vector<std::string>({"-x", "45", "-f", "27"}) == app.remaining());
 
     std::vector<std::string> left_over = app.remaining_for_passthrough();
 
@@ -1899,23 +1897,23 @@ TEST_F(TApp, AllowExtrasCascade) {
     capp.add_option("-f", v2);
 
     capp.parse(left_over);
-    EXPECT_EQ(v1, 45);
-    EXPECT_EQ(v2, 27);
+    CHECK(45 == v1);
+    CHECK(27 == v2);
 }
 // makes sure the error throws on the rValue version of the parse
-TEST_F(TApp, ExtrasErrorRvalueParse) {
+TEST_CASE_METHOD(TApp, "ExtrasErrorRvalueParse", "[app]") {
 
     args = {"-x", "45", "-f", "27"};
-    EXPECT_THROW(app.parse(std::vector<std::string>({"-x", "45", "-f", "27"})), CLI::ExtrasError);
+    CHECK_THROWS_AS(app.parse(std::vector<std::string>({"-x", "45", "-f", "27"})), CLI::ExtrasError);
 }
 
-TEST_F(TApp, AllowExtrasCascadeDirect) {
+TEST_CASE_METHOD(TApp, "AllowExtrasCascadeDirect", "[app]") {
 
     app.allow_extras();
 
     args = {"-x", "45", "-f", "27"};
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"-x", "45", "-f", "27"}));
+    REQUIRE_NOTHROW(run());
+    CHECK(std::vector<std::string>({"-x", "45", "-f", "27"}) == app.remaining());
 
     CLI::App capp{"cascade_program"};
     int v1{0};
@@ -1924,11 +1922,11 @@ TEST_F(TApp, AllowExtrasCascadeDirect) {
     capp.add_option("-f", v2);
 
     capp.parse(app.remaining_for_passthrough());
-    EXPECT_EQ(v1, 45);
-    EXPECT_EQ(v2, 27);
+    CHECK(45 == v1);
+    CHECK(27 == v2);
 }
 
-TEST_F(TApp, AllowExtrasArgModify) {
+TEST_CASE_METHOD(TApp, "AllowExtrasArgModify", "[app]") {
 
     int v1{0};
     int v2{0};
@@ -1937,88 +1935,89 @@ TEST_F(TApp, AllowExtrasArgModify) {
     args = {"27", "-f", "45", "-x"};
     auto cargs = args;
     app.parse(args);
-    EXPECT_EQ(args, std::vector<std::string>({"45", "-x"}));
+    CHECK(std::vector<std::string>({"45", "-x"}) == args);
 
     CLI::App capp{"cascade_program"};
 
     capp.add_option("-x", v1);
 
     capp.parse(args);
-    EXPECT_EQ(v1, 45);
-    EXPECT_EQ(v2, 27);
+    CHECK(45 == v1);
+    CHECK(27 == v2);
 }
 
 // Test horrible error
-TEST_F(TApp, CheckShortFail) {
+TEST_CASE_METHOD(TApp, "CheckShortFail", "[app]") {
     args = {"--two"};
 
-    EXPECT_THROW(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::SHORT), CLI::HorribleError);
+    CHECK_THROWS_AS(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::SHORT), CLI::HorribleError);
 }
 
 // Test horrible error
-TEST_F(TApp, CheckLongFail) {
+TEST_CASE_METHOD(TApp, "CheckLongFail", "[app]") {
     args = {"-t"};
 
-    EXPECT_THROW(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::LONG), CLI::HorribleError);
+    CHECK_THROWS_AS(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::LONG), CLI::HorribleError);
 }
 
 // Test horrible error
-TEST_F(TApp, CheckWindowsFail) {
+TEST_CASE_METHOD(TApp, "CheckWindowsFail", "[app]") {
     args = {"-t"};
 
-    EXPECT_THROW(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::WINDOWS), CLI::HorribleError);
+    CHECK_THROWS_AS(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::WINDOWS_STYLE),
+                    CLI::HorribleError);
 }
 
 // Test horrible error
-TEST_F(TApp, CheckOtherFail) {
+TEST_CASE_METHOD(TApp, "CheckOtherFail", "[app]") {
     args = {"-t"};
 
-    EXPECT_THROW(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::NONE), CLI::HorribleError);
+    CHECK_THROWS_AS(CLI::detail::AppFriend::parse_arg(&app, args, CLI::detail::Classifier::NONE), CLI::HorribleError);
 }
 
 // Test horrible error
-TEST_F(TApp, CheckSubcomFail) {
+TEST_CASE_METHOD(TApp, "CheckSubcomFail", "[app]") {
     args = {"subcom"};
 
-    EXPECT_THROW(CLI::detail::AppFriend::parse_subcommand(&app, args), CLI::HorribleError);
+    CHECK_THROWS_AS(CLI::detail::AppFriend::parse_subcommand(&app, args), CLI::HorribleError);
 }
 
-TEST_F(TApp, FallthroughParentFail) {
-    EXPECT_THROW(CLI::detail::AppFriend::get_fallthrough_parent(&app), CLI::HorribleError);
+TEST_CASE_METHOD(TApp, "FallthroughParentFail", "[app]") {
+    CHECK_THROWS_AS(CLI::detail::AppFriend::get_fallthrough_parent(&app), CLI::HorribleError);
 }
 
-TEST_F(TApp, FallthroughParents) {
+TEST_CASE_METHOD(TApp, "FallthroughParents", "[app]") {
     auto sub = app.add_subcommand("test");
-    EXPECT_EQ(CLI::detail::AppFriend::get_fallthrough_parent(sub), &app);
+    CHECK(&app == CLI::detail::AppFriend::get_fallthrough_parent(sub));
 
     auto ssub = sub->add_subcommand("sub2");
-    EXPECT_EQ(CLI::detail::AppFriend::get_fallthrough_parent(ssub), sub);
+    CHECK(sub == CLI::detail::AppFriend::get_fallthrough_parent(ssub));
 
     auto og1 = app.add_option_group("g1");
     auto og2 = og1->add_option_group("g2");
     auto og3 = og2->add_option_group("g3");
-    EXPECT_EQ(CLI::detail::AppFriend::get_fallthrough_parent(og3), &app);
+    CHECK(&app == CLI::detail::AppFriend::get_fallthrough_parent(og3));
 
     auto ogb1 = sub->add_option_group("g1");
     auto ogb2 = ogb1->add_option_group("g2");
     auto ogb3 = ogb2->add_option_group("g3");
-    EXPECT_EQ(CLI::detail::AppFriend::get_fallthrough_parent(ogb3), sub);
+    CHECK(sub == CLI::detail::AppFriend::get_fallthrough_parent(ogb3));
 
     ogb2->name("groupb");
-    EXPECT_EQ(CLI::detail::AppFriend::get_fallthrough_parent(ogb3), ogb2);
+    CHECK(ogb2 == CLI::detail::AppFriend::get_fallthrough_parent(ogb3));
 }
 
-TEST_F(TApp, OptionWithDefaults) {
+TEST_CASE_METHOD(TApp, "OptionWithDefaults", "[app]") {
     int someint{2};
     app.add_option("-a", someint)->capture_default_str();
 
     args = {"-a1", "-a2"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
 // Added to test ->transform
-TEST_F(TApp, OrderedModifyingTransforms) {
+TEST_CASE_METHOD(TApp, "OrderedModifyingTransforms", "[app]") {
     std::vector<std::string> val;
     auto m = app.add_option("-m", val);
     m->transform([](std::string x) { return x + "1"; });
@@ -2028,29 +2027,29 @@ TEST_F(TApp, OrderedModifyingTransforms) {
 
     run();
 
-    EXPECT_EQ(val, std::vector<std::string>({"one21", "two21"}));
+    CHECK(std::vector<std::string>({"one21", "two21"}) == val);
 }
 
-TEST_F(TApp, ThrowingTransform) {
+TEST_CASE_METHOD(TApp, "ThrowingTransform", "[app]") {
     std::string val;
     auto m = app.add_option("-m,--mess", val);
     m->transform([](std::string) -> std::string { throw CLI::ValidationError("My Message"); });
 
-    ASSERT_NO_THROW(run());
+    REQUIRE_NOTHROW(run());
 
     args = {"-mone"};
 
-    ASSERT_THROW(run(), CLI::ValidationError);
+    REQUIRE_THROWS_AS(run(), CLI::ValidationError);
 
     try {
         run();
     } catch(const CLI::ValidationError &e) {
-        EXPECT_EQ(e.what(), std::string("--mess: My Message"));
+        CHECK(std::string("--mess: My Message") == e.what());
     }
 }
 
 // This was added to make running a simple function on each item easier
-TEST_F(TApp, EachItem) {
+TEST_CASE_METHOD(TApp, "EachItem", "[app]") {
 
     std::vector<std::string> results;
     std::vector<std::string> dummy;
@@ -2063,35 +2062,35 @@ TEST_F(TApp, EachItem) {
 
     run();
 
-    EXPECT_EQ(results, dummy);
+    CHECK(dummy == results);
 }
 
 // #128
-TEST_F(TApp, RepeatingMultiArgumentOptions) {
+TEST_CASE_METHOD(TApp, "RepeatingMultiArgumentOptions", "[app]") {
     std::vector<std::string> entries;
     app.add_option("--entry", entries, "set a key and value")->type_name("KEY VALUE")->type_size(-2);
 
     args = {"--entry", "key1", "value1", "--entry", "key2", "value2"};
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(entries, std::vector<std::string>({"key1", "value1", "key2", "value2"}));
+    REQUIRE_NOTHROW(run());
+    CHECK(std::vector<std::string>({"key1", "value1", "key2", "value2"}) == entries);
 
     args.pop_back();
-    ASSERT_THROW(run(), CLI::ArgumentMismatch);
+    REQUIRE_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
 // #122
-TEST_F(TApp, EmptyOptionEach) {
+TEST_CASE_METHOD(TApp, "EmptyOptionEach", "[app]") {
     std::string q;
     app.add_option("--each")->each([&q](std::string s) { q = s; });
 
     args = {"--each", "that"};
     run();
 
-    EXPECT_EQ(q, "that");
+    CHECK("that" == q);
 }
 
 // #122
-TEST_F(TApp, EmptyOptionFail) {
+TEST_CASE_METHOD(TApp, "EmptyOptionFail", "[app]") {
     std::string q;
     app.add_option("--each");
 
@@ -2099,116 +2098,116 @@ TEST_F(TApp, EmptyOptionFail) {
     run();
 }
 
-TEST_F(TApp, BeforeRequirements) {
+TEST_CASE_METHOD(TApp, "BeforeRequirements", "[app]") {
     app.add_flag_function("-a", [](std::int64_t) { throw CLI::Success(); });
     app.add_flag_function("-b", [](std::int64_t) { throw CLI::CallForHelp(); });
 
     args = {"extra"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"-a", "extra"};
-    EXPECT_THROW(run(), CLI::Success);
+    CHECK_THROWS_AS(run(), CLI::Success);
 
     args = {"-b", "extra"};
-    EXPECT_THROW(run(), CLI::CallForHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForHelp);
 
     // These run in definition order.
     args = {"-a", "-b", "extra"};
-    EXPECT_THROW(run(), CLI::Success);
+    CHECK_THROWS_AS(run(), CLI::Success);
 
     // Currently, the original order is not preserved when calling callbacks
     // args = {"-b", "-a", "extra"};
-    // EXPECT_THROW(run(), CLI::CallForHelp);
+    // CHECK_THROWS_AS (run(), CLI::CallForHelp);
 }
 
 // #209
-TEST_F(TApp, CustomUserSepParse) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse", "[app]") {
 
     std::vector<int> vals{1, 2, 3};
     args = {"--idx", "1,2,3"};
     auto opt = app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<int>({1, 2, 3}) == vals);
     std::vector<int> vals2;
     // check that the results vector gets the results in the same way
     opt->results(vals2);
-    EXPECT_EQ(vals2, vals);
+    CHECK(vals == vals2);
 
     app.remove_option(opt);
 
     app.add_option("--idx", vals)->delimiter(',')->capture_default_str();
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<int>({1, 2, 3}) == vals);
 }
 
 // #209
-TEST_F(TApp, DefaultUserSepParse) {
+TEST_CASE_METHOD(TApp, "DefaultUserSepParse", "[app]") {
 
     std::vector<std::string> vals;
     args = {"--idx", "1 2 3", "4 5 6"};
     auto opt = app.add_option("--idx", vals, "");
     run();
-    EXPECT_EQ(vals, std::vector<std::string>({"1 2 3", "4 5 6"}));
+    CHECK(std::vector<std::string>({"1 2 3", "4 5 6"}) == vals);
     opt->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<std::string>({"1 2 3", "4 5 6"}));
+    CHECK(std::vector<std::string>({"1 2 3", "4 5 6"}) == vals);
 }
 
 // #209
-TEST_F(TApp, BadUserSepParse) {
+TEST_CASE_METHOD(TApp, "BadUserSepParse", "[app]") {
 
     std::vector<int> vals;
     app.add_option("--idx", vals);
 
     args = {"--idx", "1,2,3"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
 // #209
-TEST_F(TApp, CustomUserSepParse2) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse2", "[app]") {
 
     std::vector<int> vals{1, 2, 3};
     args = {"--idx", "1,2,"};
     auto opt = app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 
     app.remove_option(opt);
 
     app.add_option("--idx", vals, "")->delimiter(',')->capture_default_str();
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 }
 
-TEST_F(TApp, CustomUserSepParseFunction) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParseFunction", "[app]") {
 
     std::vector<int> vals{1, 2, 3};
     args = {"--idx", "1,2,3"};
     app.add_option_function<std::vector<int>>("--idx", [&vals](std::vector<int> v) { vals = std::move(v); })
         ->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<int>({1, 2, 3}) == vals);
 }
 
 // delimiter removal
-TEST_F(TApp, CustomUserSepParseToggle) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParseToggle", "[app]") {
 
     std::vector<std::string> vals;
     args = {"--idx", "1,2,3"};
     auto opt = app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<std::string>({"1", "2", "3"}));
+    CHECK(std::vector<std::string>({"1", "2", "3"}) == vals);
     opt->delimiter('\0');
     run();
-    EXPECT_EQ(vals, std::vector<std::string>({"1,2,3"}));
+    CHECK(std::vector<std::string>({"1,2,3"}) == vals);
     opt->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<std::string>({"1", "2", "3"}));
+    CHECK(std::vector<std::string>({"1", "2", "3"}) == vals);
 }
 
 // #209
-TEST_F(TApp, CustomUserSepParse3) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse3", "[app]") {
 
     std::vector<int> vals = {1, 2, 3};
     args = {"--idx",
@@ -2217,42 +2216,42 @@ TEST_F(TApp, CustomUserSepParse3) {
             "2"};
     auto opt = app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
     app.remove_option(opt);
 
     app.add_option("--idx", vals, "", false)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 }
 
 // #209
-TEST_F(TApp, CustomUserSepParse4) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse4", "[app]") {
 
     std::vector<int> vals;
     args = {"--idx", "1,    2"};
     auto opt = app.add_option("--idx", vals)->delimiter(',')->capture_default_str();
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 
     app.remove_option(opt);
 
     app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 }
 
 // #218
-TEST_F(TApp, CustomUserSepParse5) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse5", "[app]") {
 
     std::vector<std::string> bar;
     args = {"this", "is", "a", "test"};
     auto opt = app.add_option("bar", bar, "bar");
     run();
-    EXPECT_EQ(bar, std::vector<std::string>({"this", "is", "a", "test"}));
+    CHECK(std::vector<std::string>({"this", "is", "a", "test"}) == bar);
 
     app.remove_option(opt);
     args = {"this", "is", "a", "test"};
     app.add_option("bar", bar, "bar")->capture_default_str();
     run();
-    EXPECT_EQ(bar, std::vector<std::string>({"this", "is", "a", "test"}));
+    CHECK(std::vector<std::string>({"this", "is", "a", "test"}) == bar);
 }
diff --git a/packages/CLI11/tests/BoostOptionTypeTest.cpp b/packages/CLI11/tests/BoostOptionTypeTest.cpp
index 3ddd6ae459bded6af8ee6af1c216891d8c061001..2110bcc6540bd881bb4474b0d00df3e21f0582c0 100644
--- a/packages/CLI11/tests/BoostOptionTypeTest.cpp
+++ b/packages/CLI11/tests/BoostOptionTypeTest.cpp
@@ -15,137 +15,112 @@
 #include <string>
 #include <vector>
 
-#include "gmock/gmock.h"
-
-namespace boost {
-namespace container {
-
-template <class T> class TApp_container_single_boost : public TApp {
-  public:
-    using container_type = T;
-    container_type cval{};
-    TApp_container_single_boost() : TApp() {}
-};
-
-using containerTypes_single_boost =
-    ::testing::Types<small_vector<int, 2>, small_vector<int, 3>, flat_set<int>, stable_vector<int>, slist<int>>;
-
-TYPED_TEST_SUITE(TApp_container_single_boost, containerTypes_single_boost, );
-
-TYPED_TEST(TApp_container_single_boost, containerInt_boost) {
-
-    auto &cv = TApp_container_single_boost<TypeParam>::cval;
-    CLI::Option *opt = (TApp::app).add_option("-v", cv);
-
-    TApp::args = {"-v", "1", "-1", "-v", "3", "-v", "-976"};
-    TApp::run();
-    EXPECT_EQ(4u, (TApp::app).count("-v"));
-    EXPECT_EQ(4u, cv.size());
+using namespace boost::container;
+
+TEMPLATE_TEST_CASE("Boost container single",
+                   "[boost][optional]",
+                   (small_vector<int, 2>),
+                   (small_vector<int, 3>),
+                   flat_set<int>,
+                   stable_vector<int>,
+                   slist<int>) {
+    TApp tapp;
+    TestType cv;
+    CLI::Option *opt = tapp.app.add_option("-v", cv);
+
+    tapp.args = {"-v", "1", "-1", "-v", "3", "-v", "-976"};
+    tapp.run();
+    CHECK(tapp.app.count("-v") == 4u);
+    CHECK(cv.size() == 4u);
     opt->check(CLI::PositiveNumber.application_index(0));
     opt->check((!CLI::PositiveNumber).application_index(1));
-    EXPECT_NO_THROW(TApp::run());
-    EXPECT_EQ(4u, cv.size());
+    CHECK_NOTHROW(tapp.run());
+    CHECK(cv.size() == 4u);
     // v[3] would be negative
     opt->check(CLI::PositiveNumber.application_index(3));
-    EXPECT_THROW(TApp::run(), CLI::ValidationError);
+    CHECK_THROWS_AS(tapp.run(), CLI::ValidationError);
 }
 
-template <class T> class TApp_container_pair_boost : public TApp {
-  public:
-    using container_type = T;
-    container_type cval{};
-    TApp_container_pair_boost() : TApp() {}
-};
-
 using isp = std::pair<int, std::string>;
-using containerTypes_pair_boost = ::testing::
-    Types<stable_vector<isp>, small_vector<isp, 2>, flat_set<isp>, slist<isp>, vector<isp>, flat_map<int, std::string>>;
 
-TYPED_TEST_SUITE(TApp_container_pair_boost, containerTypes_pair_boost, );
+TEMPLATE_TEST_CASE("Boost container pair",
+                   "[boost][optional]",
+                   stable_vector<isp>,
+                   (small_vector<isp, 2>),
+                   flat_set<isp>,
+                   slist<isp>,
+                   vector<isp>,
+                   (flat_map<int, std::string>)) {
 
-TYPED_TEST(TApp_container_pair_boost, containerPair_boost) {
+    TApp tapp;
+    TestType cv;
 
-    auto &cv = TApp_container_pair_boost<TypeParam>::cval;
-    (TApp::app).add_option("--dict", cv);
+    tapp.app.add_option("--dict", cv);
 
-    TApp::args = {"--dict", "1", "str1", "--dict", "3", "str3"};
+    tapp.args = {"--dict", "1", "str1", "--dict", "3", "str3"};
 
-    TApp::run();
-    EXPECT_EQ(cv.size(), 2u);
+    tapp.run();
+    CHECK(2u == cv.size());
 
-    TApp::args = {"--dict", "1", "str1", "--dict", "3", "--dict", "-1", "str4"};
-    TApp::run();
-    EXPECT_EQ(cv.size(), 3u);
+    tapp.args = {"--dict", "1", "str1", "--dict", "3", "--dict", "-1", "str4"};
+    tapp.run();
+    CHECK(3u == cv.size());
 }
 
-template <class T> class TApp_container_tuple_boost : public TApp {
-  public:
-    using container_type = T;
-    container_type cval{};
-    TApp_container_tuple_boost() : TApp() {}
-};
-
 using tup_obj = std::tuple<int, std::string, double>;
-using containerTypes_tuple_boost =
-    ::testing::Types<small_vector<tup_obj, 3>, stable_vector<tup_obj>, flat_set<tup_obj>, slist<tup_obj>>;
 
-TYPED_TEST_SUITE(TApp_container_tuple_boost, containerTypes_tuple_boost, );
+TEMPLATE_TEST_CASE("Boost container tuple",
+                   "[boost][optional]",
+                   (small_vector<tup_obj, 3>),
+                   stable_vector<tup_obj>,
+                   flat_set<tup_obj>,
+                   slist<tup_obj>) {
+    TApp tapp;
+    TestType cv;
 
-TYPED_TEST(TApp_container_tuple_boost, containerTuple_boost) {
+    tapp.app.add_option("--dict", cv);
 
-    auto &cv = TApp_container_tuple_boost<TypeParam>::cval;
-    (TApp::app).add_option("--dict", cv);
+    tapp.args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7"};
 
-    TApp::args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7"};
+    tapp.run();
+    CHECK(2u == cv.size());
 
-    TApp::run();
-    EXPECT_EQ(cv.size(), 2u);
-
-    TApp::args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7", "--dict", "-1", "str4", "-1.87"};
-    TApp::run();
-    EXPECT_EQ(cv.size(), 3u);
+    tapp.args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7", "--dict", "-1", "str4", "-1.87"};
+    tapp.run();
+    CHECK(3u == cv.size());
 }
 
 using icontainer1 = vector<int>;
 using icontainer2 = flat_set<int>;
 using icontainer3 = slist<int>;
-using containerTypes_container_boost = ::testing::Types<std::vector<icontainer1>,
-                                                        slist<icontainer1>,
-                                                        flat_set<icontainer1>,
-                                                        small_vector<icontainer1, 2>,
-                                                        std::vector<icontainer2>,
-                                                        slist<icontainer2>,
-                                                        flat_set<icontainer2>,
-                                                        stable_vector<icontainer2>,
-                                                        static_vector<icontainer3, 10>,
-                                                        slist<icontainer3>,
-                                                        flat_set<icontainer3>,
-                                                        static_vector<icontainer3, 10>>;
-
-template <class T> class TApp_container_container_boost : public TApp {
-  public:
-    using container_type = T;
-    container_type cval{};
-    TApp_container_container_boost() : TApp() {}
-};
-
-TYPED_TEST_SUITE(TApp_container_container_boost, containerTypes_container_boost, );
-
-TYPED_TEST(TApp_container_container_boost, containerContainer_boost) {
-
-    auto &cv = TApp_container_container_boost<TypeParam>::cval;
-    (TApp::app).add_option("--dict", cv);
-
-    TApp::args = {"--dict", "1", "2", "4", "--dict", "3", "1"};
-
-    TApp::run();
-    EXPECT_EQ(cv.size(), 2u);
-
-    TApp::args = {"--dict", "1", "2", "4", "--dict", "3", "1", "--dict", "3", "--dict",
-                  "3",      "3", "3", "3", "3",      "3", "3", "3",      "3", "-3"};
-    TApp::run();
-    EXPECT_EQ(cv.size(), 4u);
-}
 
-}  // namespace container
-}  // namespace boost
+TEMPLATE_TEST_CASE("Boost container container",
+                   "[boost][optional]",
+                   std::vector<icontainer1>,
+                   slist<icontainer1>,
+                   flat_set<icontainer1>,
+                   (small_vector<icontainer1, 2>),
+                   std::vector<icontainer2>,
+                   slist<icontainer2>,
+                   flat_set<icontainer2>,
+                   stable_vector<icontainer2>,
+                   (static_vector<icontainer2, 10>),
+                   slist<icontainer3>,
+                   flat_set<icontainer3>,
+                   (static_vector<icontainer3, 10>)) {
+
+    TApp tapp;
+    TestType cv;
+
+    tapp.app.add_option("--dict", cv);
+
+    tapp.args = {"--dict", "1", "2", "4", "--dict", "3", "1"};
+
+    tapp.run();
+    CHECK(2u == cv.size());
+
+    tapp.args = {"--dict", "1", "2", "4", "--dict", "3", "1", "--dict", "3", "--dict",
+                 "3",      "3", "3", "3", "3",      "3", "3", "3",      "3", "-3"};
+    tapp.run();
+    CHECK(4u == cv.size());
+}
diff --git a/packages/CLI11/tests/CMakeLists.txt b/packages/CLI11/tests/CMakeLists.txt
index cce6e30b8f388309113bed5134a4221dc082e6fd..4a86923f6dd7b74a9f51d02785e9f060e0617129 100644
--- a/packages/CLI11/tests/CMakeLists.txt
+++ b/packages/CLI11/tests/CMakeLists.txt
@@ -1,8 +1,3 @@
-if(NOT EXISTS "${CLI11_SOURCE_DIR}/extern/googletest/CMakeLists.txt")
-    message(FATAL_ERROR "You have requested tests be built, but googletest is not downloaded. Please run:
-    git submodule update --init")
-endif()
-
 list(APPEND CMAKE_MODULE_PATH "${CLI11_SOURCE_DIR}/cmake")
 
 if(CLI11_SANITIZERS)
@@ -29,8 +24,6 @@ else()
     endmacro()
 endif()
 
-set(GOOGLE_TEST_INDIVIDUAL OFF)
-include(AddGoogletest)
 
 # Add boost to test boost::optional (currently explicitly requested)"
 option(CLI11_BOOST "Turn on boost test (currently may fail with Boost 1.70)" OFF)
@@ -70,8 +63,32 @@ endif()
 
 set(CLI11_MULTIONLY_TESTS TimerTest)
 
-# Only affects current directory, so safe
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_library(catch_main main.cpp)
+target_include_directories(catch_main PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
+
+# Currently a required download; could be make to look for existing Catch2, but
+# that would require changing the includes. FetchContent would be better, but
+# requires newer CMake.
+
+set(url https://github.com/philsquared/Catch/releases/download/v2.13.4/catch.hpp)
+file(DOWNLOAD ${url} "${CMAKE_CURRENT_BINARY_DIR}/catch.hpp" STATUS status EXPECTED_HASH SHA256=6e0fa3dd160891a01c1f3b34e8bcd6e0140abe08eca022e390027f27dec2050b)
+list(GET status 0 error)
+if(error)
+    message(FATAL_ERROR "Could not download ${url}")
+endif()
+target_include_directories(catch_main PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
+
+# Target must already exist
+macro(add_catch_test TESTNAME)
+    target_link_libraries(${TESTNAME} PUBLIC catch_main)
+
+    add_test(${TESTNAME} ${TESTNAME})
+    set_target_properties(${TESTNAME} PROPERTIES FOLDER "Tests")
+    if (CLI11_FORCE_LIBCXX)
+       set_property(TARGET ${T} APPEND_STRING
+         PROPERTY LINK_FLAGS -stdlib=libc++)
+     endif()
+endmacro()
 
 foreach(T IN LISTS CLI11_TESTS)
     if(CLI11_CUDA_TESTS)
@@ -86,12 +103,12 @@ foreach(T IN LISTS CLI11_TESTS)
         target_link_libraries(${T} PRIVATE CLI11_warnings)
     endif()
     target_link_libraries(${T} PRIVATE CLI11)
-    add_gtest(${T})
+    add_catch_test(${T})
 
     if(CLI11_SINGLE_FILE AND CLI11_SINGLE_FILE_TESTS)
         add_executable(${T}_Single ${T}.cpp)
         target_link_libraries(${T}_Single PRIVATE CLI11_SINGLE)
-        add_gtest(${T}_Single)
+        add_catch_test(${T}_Single)
         set_property(TARGET ${T}_Single PROPERTY FOLDER "Tests Single File")
     endif()
 endforeach()
@@ -100,7 +117,7 @@ foreach(T IN LISTS CLI11_MULTIONLY_TESTS)
     add_executable(${T} ${T}.cpp ${CLI11_headers})
     add_sanitizers(${T})
     target_link_libraries(${T} PUBLIC CLI11)
-    add_gtest(${T})
+    add_catch_test(${T})
 endforeach()
 
 # Add -Wno-deprecated-declarations to DeprecatedTest
@@ -123,7 +140,7 @@ target_link_libraries(link_test_1 PUBLIC CLI11)
 set_target_properties(link_test_1 PROPERTIES FOLDER "Tests")
 add_executable(link_test_2 link_test_2.cpp)
 target_link_libraries(link_test_2 PUBLIC CLI11 link_test_1)
-add_gtest(link_test_2)
+add_catch_test(link_test_2)
 if(CLI11_FORCE_LIBCXX)
     set_property(TARGET link_test_1 APPEND_STRING
         PROPERTY LINK_FLAGS -stdlib=libc++)
diff --git a/packages/CLI11/tests/ComplexTypeTest.cpp b/packages/CLI11/tests/ComplexTypeTest.cpp
index 43c6cd48bab35cdc2a2ad09e6e59ccbc712a9dd6..b9a5d4e51cfc2584b8f199ef9ced5c8be245f8a4 100644
--- a/packages/CLI11/tests/ComplexTypeTest.cpp
+++ b/packages/CLI11/tests/ComplexTypeTest.cpp
@@ -5,11 +5,11 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "app_helper.hpp"
-#include "gmock/gmock.h"
+
 #include <complex>
 #include <cstdint>
 
-using ::testing::HasSubstr;
+using Catch::Matchers::Contains;
 
 using cx = std::complex<double>;
 
@@ -33,7 +33,7 @@ add_option(CLI::App &app, std::string name, cx &variable, std::string descriptio
     return opt;
 }
 
-TEST_F(TApp, AddingComplexParser) {
+TEST_CASE_METHOD(TApp, "AddingComplexParser", "[complex]") {
 
     cx comp{0, 0};
     add_option(app, "-c,--complex", comp);
@@ -41,27 +41,27 @@ TEST_F(TApp, AddingComplexParser) {
 
     run();
 
-    EXPECT_DOUBLE_EQ(1.5, comp.real());
-    EXPECT_DOUBLE_EQ(2.5, comp.imag());
+    CHECK(comp.real() == Approx(1.5));
+    CHECK(comp.imag() == Approx(2.5));
 }
 
-TEST_F(TApp, DefaultedComplex) {
+TEST_CASE_METHOD(TApp, "DefaultedComplex", "[complex]") {
 
     cx comp{1, 2};
     add_option(app, "-c,--complex", comp, "", true);
     args = {"-c", "4", "3"};
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("2"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("2"));
 
-    EXPECT_DOUBLE_EQ(1, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(1));
+    CHECK(comp.imag() == Approx(2));
 
     run();
 
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 }
 
 // an example of custom complex number converter that can be used to add new parsing options
@@ -117,7 +117,7 @@ template <> bool lexical_cast<std::complex<double>>(const std::string &input, st
 }  // namespace detail
 }  // namespace CLI
 
-TEST_F(TApp, AddingComplexParserDetail) {
+TEST_CASE_METHOD(TApp, "AddingComplexParserDetail", "[complex]") {
 
     bool skip_tests = false;
     try {  // check if the library actually supports regex,  it is possible to link against a non working regex in the
@@ -131,7 +131,7 @@ TEST_F(TApp, AddingComplexParserDetail) {
         if(!rsearch) {
             skip_tests = true;
         } else {
-            EXPECT_EQ(m.size(), 9u);
+            CHECK(9u == m.size());
         }
 
     } catch(...) {
@@ -146,14 +146,14 @@ TEST_F(TApp, AddingComplexParserDetail) {
 
         run();
 
-        EXPECT_DOUBLE_EQ(1.5, comp.real());
-        EXPECT_DOUBLE_EQ(2.5, comp.imag());
+        CHECK(comp.real() == Approx(1.5));
+        CHECK(comp.imag() == Approx(2.5));
         args = {"-c", "1.5-2.5j"};
 
         run();
 
-        EXPECT_DOUBLE_EQ(1.5, comp.real());
-        EXPECT_DOUBLE_EQ(-2.5, comp.imag());
+        CHECK(comp.real() == Approx(1.5));
+        CHECK(comp.imag() == Approx(-2.5));
     }
 }
 #endif
@@ -170,7 +170,7 @@ class complex_new {
     double val2_{0.0};
 };
 
-TEST_F(TApp, newComplex) {
+TEST_CASE_METHOD(TApp, "newComplex", "[complex]") {
     complex_new cval;
     static_assert(CLI::detail::is_complex<complex_new>::value, "complex new does not register as a complex type");
     static_assert(CLI::detail::classify_object<complex_new>::value == CLI::detail::object_category::complex_number,
@@ -180,12 +180,12 @@ TEST_F(TApp, newComplex) {
 
     run();
 
-    EXPECT_DOUBLE_EQ(1.5, cval.real());
-    EXPECT_DOUBLE_EQ(2.5, cval.imag());
+    CHECK(cval.real() == Approx(1.5));
+    CHECK(cval.imag() == Approx(2.5));
     args = {"-c", "1.5-2.5j"};
 
     run();
 
-    EXPECT_DOUBLE_EQ(1.5, cval.real());
-    EXPECT_DOUBLE_EQ(-2.5, cval.imag());
+    CHECK(cval.real() == Approx(1.5));
+    CHECK(cval.imag() == Approx(-2.5));
 }
diff --git a/packages/CLI11/tests/ConfigFileTest.cpp b/packages/CLI11/tests/ConfigFileTest.cpp
index d8b19d142aff335f13e872d38113a405b54c840c..12fa88c10a3949c218b246c1b6a9ddd429f9f309 100644
--- a/packages/CLI11/tests/ConfigFileTest.cpp
+++ b/packages/CLI11/tests/ConfigFileTest.cpp
@@ -6,63 +6,61 @@
 
 #include "app_helper.hpp"
 
-#include "gmock/gmock.h"
 #include <cstdio>
 #include <sstream>
 
-using ::testing::HasSubstr;
-using ::testing::Not;
+using Catch::Matchers::Contains;
 
-TEST(StringBased, convert_arg_for_ini) {
+TEST_CASE("StringBased: convert_arg_for_ini", "[config]") {
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini(std::string{}), "\"\"");
+    CHECK("\"\"" == CLI::detail::convert_arg_for_ini(std::string{}));
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("true"), "true");
+    CHECK("true" == CLI::detail::convert_arg_for_ini("true"));
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("nan"), "nan");
+    CHECK("nan" == CLI::detail::convert_arg_for_ini("nan"));
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("happy hippo"), "\"happy hippo\"");
+    CHECK("\"happy hippo\"" == CLI::detail::convert_arg_for_ini("happy hippo"));
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("47"), "47");
+    CHECK("47" == CLI::detail::convert_arg_for_ini("47"));
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("47.365225"), "47.365225");
+    CHECK("47.365225" == CLI::detail::convert_arg_for_ini("47.365225"));
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("+3.28e-25"), "+3.28e-25");
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("-22E14"), "-22E14");
+    CHECK("+3.28e-25" == CLI::detail::convert_arg_for_ini("+3.28e-25"));
+    CHECK("-22E14" == CLI::detail::convert_arg_for_ini("-22E14"));
 
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("a"), "'a'");
+    CHECK("'a'" == CLI::detail::convert_arg_for_ini("a"));
     // hex
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("0x5461FAED"), "0x5461FAED");
+    CHECK("0x5461FAED" == CLI::detail::convert_arg_for_ini("0x5461FAED"));
     // hex fail
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("0x5461FAEG"), "\"0x5461FAEG\"");
+    CHECK("\"0x5461FAEG\"" == CLI::detail::convert_arg_for_ini("0x5461FAEG"));
 
     // octal
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("0o546123567"), "0o546123567");
+    CHECK("0o546123567" == CLI::detail::convert_arg_for_ini("0o546123567"));
     // octal fail
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("0o546123587"), "\"0o546123587\"");
+    CHECK("\"0o546123587\"" == CLI::detail::convert_arg_for_ini("0o546123587"));
 
     // binary
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("0b01101110010"), "0b01101110010");
+    CHECK("0b01101110010" == CLI::detail::convert_arg_for_ini("0b01101110010"));
     // binary fail
-    EXPECT_EQ(CLI::detail::convert_arg_for_ini("0b01102110010"), "\"0b01102110010\"");
+    CHECK("\"0b01102110010\"" == CLI::detail::convert_arg_for_ini("0b01102110010"));
 }
 
-TEST(StringBased, IniJoin) {
+TEST_CASE("StringBased: IniJoin", "[config]") {
     std::vector<std::string> items = {"one", "two", "three four"};
     std::string result = "\"one\" \"two\" \"three four\"";
 
-    EXPECT_EQ(CLI::detail::ini_join(items, ' ', '\0', '\0'), result);
+    CHECK(result == CLI::detail::ini_join(items, ' ', '\0', '\0'));
 
     result = "[\"one\", \"two\", \"three four\"]";
 
-    EXPECT_EQ(CLI::detail::ini_join(items), result);
+    CHECK(result == CLI::detail::ini_join(items));
 
     result = "{\"one\"; \"two\"; \"three four\"}";
 
-    EXPECT_EQ(CLI::detail::ini_join(items, ';', '{', '}'), result);
+    CHECK(result == CLI::detail::ini_join(items, ';', '{', '}'));
 }
 
-TEST(StringBased, First) {
+TEST_CASE("StringBased: First", "[config]") {
     std::stringstream ofile;
 
     ofile << "one=three\n";
@@ -72,16 +70,16 @@ TEST(StringBased, First) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(2u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("two", output.at(1).name);
-    EXPECT_EQ(1u, output.at(1).inputs.size());
-    EXPECT_EQ("four", output.at(1).inputs.at(0));
+    CHECK(output.size() == 2u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "four");
 }
 
-TEST(StringBased, FirstWithComments) {
+TEST_CASE("StringBased: FirstWithComments", "[config]") {
     std::stringstream ofile;
 
     ofile << ";this is a comment\n";
@@ -93,16 +91,16 @@ TEST(StringBased, FirstWithComments) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(2u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("two", output.at(1).name);
-    EXPECT_EQ(1u, output.at(1).inputs.size());
-    EXPECT_EQ("four", output.at(1).inputs.at(0));
+    CHECK(output.size() == 2u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "four");
 }
 
-TEST(StringBased, Quotes) {
+TEST_CASE("StringBased: Quotes", "[config]") {
     std::stringstream ofile;
 
     ofile << R"(one = "three")" << '\n';
@@ -113,19 +111,19 @@ TEST(StringBased, Quotes) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(3u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("two", output.at(1).name);
-    EXPECT_EQ(1u, output.at(1).inputs.size());
-    EXPECT_EQ("four", output.at(1).inputs.at(0));
-    EXPECT_EQ("five", output.at(2).name);
-    EXPECT_EQ(1u, output.at(2).inputs.size());
-    EXPECT_EQ("six and seven", output.at(2).inputs.at(0));
+    CHECK(output.size() == 3u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "four");
+    CHECK(output.at(2).name == "five");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "six and seven");
 }
 
-TEST(StringBased, Vector) {
+TEST_CASE("StringBased: Vector", "[config]") {
     std::stringstream ofile;
 
     ofile << "one = three\n";
@@ -136,21 +134,21 @@ TEST(StringBased, Vector) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(3u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("two", output.at(1).name);
-    EXPECT_EQ(1u, output.at(1).inputs.size());
-    EXPECT_EQ("four", output.at(1).inputs.at(0));
-    EXPECT_EQ("five", output.at(2).name);
-    EXPECT_EQ(3u, output.at(2).inputs.size());
-    EXPECT_EQ("six", output.at(2).inputs.at(0));
-    EXPECT_EQ("and", output.at(2).inputs.at(1));
-    EXPECT_EQ("seven", output.at(2).inputs.at(2));
-}
-
-TEST(StringBased, TomlVector) {
+    CHECK(output.size() == 3u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "four");
+    CHECK(output.at(2).name == "five");
+    CHECK(output.at(2).inputs.size() == 3u);
+    CHECK(output.at(2).inputs.at(0) == "six");
+    CHECK(output.at(2).inputs.at(1) == "and");
+    CHECK(output.at(2).inputs.at(2) == "seven");
+}
+
+TEST_CASE("StringBased: TomlVector", "[config]") {
     std::stringstream ofile;
 
     ofile << "one = [three]\n";
@@ -166,32 +164,32 @@ TEST(StringBased, TomlVector) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(5u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("two", output.at(1).name);
-    EXPECT_EQ(1u, output.at(1).inputs.size());
-    EXPECT_EQ("four", output.at(1).inputs.at(0));
-    EXPECT_EQ("five", output.at(2).name);
-    EXPECT_EQ(3u, output.at(2).inputs.size());
-    EXPECT_EQ("six", output.at(2).inputs.at(0));
-    EXPECT_EQ("and", output.at(2).inputs.at(1));
-    EXPECT_EQ("seven", output.at(2).inputs.at(2));
-    EXPECT_EQ("eight", output.at(3).name);
-    EXPECT_EQ(4u, output.at(3).inputs.size());
-    EXPECT_EQ("nine", output.at(3).inputs.at(0));
-    EXPECT_EQ("ten", output.at(3).inputs.at(1));
-    EXPECT_EQ("eleven", output.at(3).inputs.at(2));
-    EXPECT_EQ("twelve", output.at(3).inputs.at(3));
-    EXPECT_EQ("one_more", output.at(4).name);
-    EXPECT_EQ(3u, output.at(4).inputs.size());
-    EXPECT_EQ("one", output.at(4).inputs.at(0));
-    EXPECT_EQ("two", output.at(4).inputs.at(1));
-    EXPECT_EQ("three", output.at(4).inputs.at(2));
-}
-
-TEST(StringBased, Spaces) {
+    CHECK(output.size() == 5u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "four");
+    CHECK(output.at(2).name == "five");
+    CHECK(output.at(2).inputs.size() == 3u);
+    CHECK(output.at(2).inputs.at(0) == "six");
+    CHECK(output.at(2).inputs.at(1) == "and");
+    CHECK(output.at(2).inputs.at(2) == "seven");
+    CHECK(output.at(3).name == "eight");
+    CHECK(output.at(3).inputs.size() == 4u);
+    CHECK(output.at(3).inputs.at(0) == "nine");
+    CHECK(output.at(3).inputs.at(1) == "ten");
+    CHECK(output.at(3).inputs.at(2) == "eleven");
+    CHECK(output.at(3).inputs.at(3) == "twelve");
+    CHECK(output.at(4).name == "one_more");
+    CHECK(output.at(4).inputs.size() == 3u);
+    CHECK(output.at(4).inputs.at(0) == "one");
+    CHECK(output.at(4).inputs.at(1) == "two");
+    CHECK(output.at(4).inputs.at(2) == "three");
+}
+
+TEST_CASE("StringBased: Spaces", "[config]") {
     std::stringstream ofile;
 
     ofile << "one = three\n";
@@ -201,16 +199,16 @@ TEST(StringBased, Spaces) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(2u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("two", output.at(1).name);
-    EXPECT_EQ(1u, output.at(1).inputs.size());
-    EXPECT_EQ("four", output.at(1).inputs.at(0));
+    CHECK(output.size() == 2u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).name == "two");
+    CHECK(output.at(1).inputs.size() == 1u);
+    CHECK(output.at(1).inputs.at(0) == "four");
 }
 
-TEST(StringBased, Sections) {
+TEST_CASE("StringBased: Sections", "[config]") {
     std::stringstream ofile;
 
     ofile << "one=three\n";
@@ -221,18 +219,18 @@ TEST(StringBased, Sections) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(4u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("two", output.at(2).name);
-    EXPECT_EQ("second", output.at(2).parents.at(0));
-    EXPECT_EQ(1u, output.at(2).inputs.size());
-    EXPECT_EQ("four", output.at(2).inputs.at(0));
-    EXPECT_EQ("second.two", output.at(2).fullname());
+    CHECK(output.size() == 4u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(2).name == "two");
+    CHECK(output.at(2).parents.at(0) == "second");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "four");
+    CHECK(output.at(2).fullname() == "second.two");
 }
 
-TEST(StringBased, SpacesSections) {
+TEST_CASE("StringBased: SpacesSections", "[config]") {
     std::stringstream ofile;
 
     ofile << "one=three\n\n";
@@ -244,19 +242,19 @@ TEST(StringBased, SpacesSections) {
 
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
-    EXPECT_EQ(4u, output.size());
-    EXPECT_EQ("one", output.at(0).name);
-    EXPECT_EQ(1u, output.at(0).inputs.size());
-    EXPECT_EQ("three", output.at(0).inputs.at(0));
-    EXPECT_EQ("second", output.at(1).parents.at(0));
-    EXPECT_EQ("++", output.at(1).name);
-    EXPECT_EQ("two", output.at(2).name);
-    EXPECT_EQ(1u, output.at(2).parents.size());
-    EXPECT_EQ("second", output.at(2).parents.at(0));
-    EXPECT_EQ(1u, output.at(2).inputs.size());
-    EXPECT_EQ("four", output.at(2).inputs.at(0));
-    EXPECT_EQ("second", output.at(3).parents.at(0));
-    EXPECT_EQ("--", output.at(3).name);
+    CHECK(output.size() == 4u);
+    CHECK(output.at(0).name == "one");
+    CHECK(output.at(0).inputs.size() == 1u);
+    CHECK(output.at(0).inputs.at(0) == "three");
+    CHECK(output.at(1).parents.at(0) == "second");
+    CHECK(output.at(1).name == "++");
+    CHECK(output.at(2).name == "two");
+    CHECK(output.at(2).parents.size() == 1u);
+    CHECK(output.at(2).parents.at(0) == "second");
+    CHECK(output.at(2).inputs.size() == 1u);
+    CHECK(output.at(2).inputs.at(0) == "four");
+    CHECK(output.at(3).parents.at(0) == "second");
+    CHECK(output.at(3).name == "--");
 }
 
 // check function to make sure that open sections match close sections
@@ -284,7 +282,7 @@ bool checkSections(const std::vector<CLI::ConfigItem> &output) {
     }
     return open.empty();
 }
-TEST(StringBased, Layers) {
+TEST_CASE("StringBased: Layers", "[config]") {
     std::stringstream ofile;
 
     ofile << "simple = true\n\n";
@@ -298,11 +296,11 @@ TEST(StringBased, Layers) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 2 flags and 4 openings and 4 closings
-    EXPECT_EQ(10u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 10u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, LayersSkip) {
+TEST_CASE("StringBased: LayersSkip", "[config]") {
     std::stringstream ofile;
 
     ofile << "simple = true\n\n";
@@ -314,11 +312,11 @@ TEST(StringBased, LayersSkip) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 2 flags and 4 openings and 4 closings
-    EXPECT_EQ(10u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 10u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, LayersSkipOrdered) {
+TEST_CASE("StringBased: LayersSkipOrdered", "[config]") {
     std::stringstream ofile;
 
     ofile << "simple = true\n\n";
@@ -330,11 +328,11 @@ TEST(StringBased, LayersSkipOrdered) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 2 flags and 4 openings and 4 closings
-    EXPECT_EQ(12u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 12u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, LayersChange) {
+TEST_CASE("StringBased: LayersChange", "[config]") {
     std::stringstream ofile;
 
     ofile << "simple = true\n\n";
@@ -346,11 +344,11 @@ TEST(StringBased, LayersChange) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 2 flags and 3 openings and 3 closings
-    EXPECT_EQ(8u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 8u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, Layers2LevelChange) {
+TEST_CASE("StringBased: Layers2LevelChange", "[config]") {
     std::stringstream ofile;
 
     ofile << "simple = true\n\n";
@@ -362,11 +360,11 @@ TEST(StringBased, Layers2LevelChange) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 2 flags and 5 openings and 5 closings
-    EXPECT_EQ(12u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 12u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, Layers3LevelChange) {
+TEST_CASE("StringBased: Layers3LevelChange", "[config]") {
     std::stringstream ofile;
 
     ofile << "[other.sub2.subsub.cmd]\n";
@@ -377,11 +375,11 @@ TEST(StringBased, Layers3LevelChange) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 1 flags and 7 openings and 7 closings
-    EXPECT_EQ(15u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 15u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, newSegment) {
+TEST_CASE("StringBased: newSegment", "[config]") {
     std::stringstream ofile;
 
     ofile << "[other.sub2.subsub.cmd]\n";
@@ -393,11 +391,11 @@ TEST(StringBased, newSegment) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 2 flags and 5 openings and 5 closings
-    EXPECT_EQ(12u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 12u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, LayersDirect) {
+TEST_CASE("StringBased: LayersDirect", "[config]") {
     std::stringstream ofile;
 
     ofile << "simple = true\n\n";
@@ -409,11 +407,11 @@ TEST(StringBased, LayersDirect) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 2 flags and 4 openings and 4 closings
-    EXPECT_EQ(10u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 10u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, LayersComplex) {
+TEST_CASE("StringBased: LayersComplex", "[config]") {
     std::stringstream ofile;
 
     ofile << "simple = true\n\n";
@@ -429,15 +427,15 @@ TEST(StringBased, LayersComplex) {
     std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_config(ofile);
 
     // 4 flags and 6 openings and 6 closings
-    EXPECT_EQ(16u, output.size());
-    EXPECT_TRUE(checkSections(output));
+    CHECK(output.size() == 16u);
+    CHECK(checkSections(output));
 }
 
-TEST(StringBased, file_error) {
-    EXPECT_THROW(std::vector<CLI::ConfigItem> output = CLI::ConfigINI().from_file("nonexist_file"), CLI::FileError);
+TEST_CASE("StringBased: file_error", "[config]") {
+    CHECK_THROWS_AS(CLI::ConfigINI().from_file("nonexist_file"), CLI::FileError);
 }
 
-TEST_F(TApp, IniNotRequired) {
+TEST_CASE_METHOD(TApp, "IniNotRequired", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -459,22 +457,22 @@ TEST_F(TApp, IniNotRequired) {
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(99, two);
-    EXPECT_EQ(3, three);
+    CHECK(one == 1);
+    CHECK(two == 99);
+    CHECK(three == 3);
 
     one = two = three = 0;
     args = {"--one=1", "--two=2"};
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
-    EXPECT_EQ(app["--config"]->as<std::string>(), "TestIniTmp.ini");
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+    CHECK("TestIniTmp.ini" == app["--config"]->as<std::string>());
 }
 
-TEST_F(TApp, IniSuccessOnUnknownOption) {
+TEST_CASE_METHOD(TApp, "IniSuccessOnUnknownOption", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
 
     app.set_config("--config", tmpini);
@@ -489,10 +487,10 @@ TEST_F(TApp, IniSuccessOnUnknownOption) {
     int two{0};
     app.add_option("--two", two);
     run();
-    EXPECT_EQ(99, two);
+    CHECK(two == 99);
 }
 
-TEST_F(TApp, IniGetRemainingOption) {
+TEST_CASE_METHOD(TApp, "IniGetRemainingOption", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
 
     app.set_config("--config", tmpini);
@@ -508,12 +506,12 @@ TEST_F(TApp, IniGetRemainingOption) {
 
     int two{0};
     app.add_option("--two", two);
-    ASSERT_NO_THROW(run());
+    REQUIRE_NOTHROW(run());
     std::vector<std::string> ExpectedRemaining = {ExtraOption};
-    EXPECT_EQ(app.remaining(), ExpectedRemaining);
+    CHECK(ExpectedRemaining == app.remaining());
 }
 
-TEST_F(TApp, IniGetNoRemaining) {
+TEST_CASE_METHOD(TApp, "IniGetNoRemaining", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
 
     app.set_config("--config", tmpini);
@@ -526,26 +524,26 @@ TEST_F(TApp, IniGetNoRemaining) {
 
     int two{0};
     app.add_option("--two", two);
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(app.remaining().size(), 0u);
+    REQUIRE_NOTHROW(run());
+    CHECK(0u == app.remaining().size());
 }
 
-TEST_F(TApp, IniRequiredNoDefault) {
+TEST_CASE_METHOD(TApp, "IniRequiredNoDefault", "[config]") {
 
     app.set_config("--config")->required();
 
     int two{0};
     app.add_option("--two", two);
-    ASSERT_THROW(run(), CLI::FileError);
+    REQUIRE_THROWS_AS(run(), CLI::FileError);
 }
 
-TEST_F(TApp, IniNotRequiredNoDefault) {
+TEST_CASE_METHOD(TApp, "IniNotRequiredNoDefault", "[config]") {
 
     app.set_config("--config");
 
     int two{0};
     app.add_option("--two", two);
-    ASSERT_NO_THROW(run());
+    REQUIRE_NOTHROW(run());
 }
 
 /// Define a class for testing purposes that does bad things
@@ -557,7 +555,7 @@ class EvilConfig : public CLI::Config {
     virtual std::vector<CLI::ConfigItem> from_config(std::istream &) const { throw CLI::FileError("evil"); }
 };
 
-TEST_F(TApp, IniRequiredbadConfigurator) {
+TEST_CASE_METHOD(TApp, "IniRequiredbadConfigurator", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -572,10 +570,10 @@ TEST_F(TApp, IniRequiredbadConfigurator) {
     app.config_formatter(std::make_shared<EvilConfig>());
     int two{0};
     app.add_option("--two", two);
-    ASSERT_THROW(run(), CLI::FileError);
+    REQUIRE_THROWS_AS(run(), CLI::FileError);
 }
 
-TEST_F(TApp, IniNotRequiredbadConfigurator) {
+TEST_CASE_METHOD(TApp, "IniNotRequiredbadConfigurator", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -590,10 +588,10 @@ TEST_F(TApp, IniNotRequiredbadConfigurator) {
     app.config_formatter(std::make_shared<EvilConfig>());
     int two{0};
     app.add_option("--two", two);
-    ASSERT_NO_THROW(run());
+    REQUIRE_NOTHROW(run());
 }
 
-TEST_F(TApp, IniNotRequiredNotDefault) {
+TEST_CASE_METHOD(TApp, "IniNotRequiredNotDefault", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
     TempFile tmpini2{"TestIniTmp2.ini"};
@@ -620,19 +618,19 @@ TEST_F(TApp, IniNotRequiredNotDefault) {
     app.add_option("--three", three);
 
     run();
-    EXPECT_EQ(app["--config"]->as<std::string>(), tmpini.c_str());
-    EXPECT_EQ(99, two);
-    EXPECT_EQ(3, three);
+    CHECK(tmpini.c_str() == app["--config"]->as<std::string>());
+    CHECK(two == 99);
+    CHECK(three == 3);
 
     args = {"--config", tmpini2};
     run();
 
-    EXPECT_EQ(98, two);
-    EXPECT_EQ(4, three);
-    EXPECT_EQ(app.get_config_ptr()->as<std::string>(), tmpini2.c_str());
+    CHECK(two == 98);
+    CHECK(three == 4);
+    CHECK(tmpini2.c_str() == app.get_config_ptr()->as<std::string>());
 }
 
-TEST_F(TApp, MultiConfig) {
+TEST_CASE_METHOD(TApp, "MultiConfig", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
     TempFile tmpini2{"TestIniTmp2.ini"};
@@ -661,19 +659,19 @@ TEST_F(TApp, MultiConfig) {
     args = {"--config", tmpini2, "--config", tmpini};
     run();
 
-    EXPECT_EQ(99, two);
-    EXPECT_EQ(3, three);
-    EXPECT_EQ(55, one);
+    CHECK(two == 99);
+    CHECK(three == 3);
+    CHECK(one == 55);
 
     args = {"--config", tmpini, "--config", tmpini2};
     run();
 
-    EXPECT_EQ(99, two);
-    EXPECT_EQ(4, three);
-    EXPECT_EQ(55, one);
+    CHECK(two == 99);
+    CHECK(three == 4);
+    CHECK(one == 55);
 }
 
-TEST_F(TApp, MultiConfig_single) {
+TEST_CASE_METHOD(TApp, "MultiConfig_single", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
     TempFile tmpini2{"TestIniTmp2.ini"};
@@ -702,37 +700,37 @@ TEST_F(TApp, MultiConfig_single) {
     args = {"--config", tmpini2, "--config", tmpini};
     run();
 
-    EXPECT_EQ(99, two);
-    EXPECT_EQ(3, three);
-    EXPECT_EQ(0, one);
+    CHECK(two == 99);
+    CHECK(three == 3);
+    CHECK(one == 0);
 
     two = 0;
     args = {"--config", tmpini, "--config", tmpini2};
     run();
 
-    EXPECT_EQ(0, two);
-    EXPECT_EQ(4, three);
-    EXPECT_EQ(55, one);
+    CHECK(two == 0);
+    CHECK(three == 4);
+    CHECK(one == 55);
 }
 
-TEST_F(TApp, IniRequiredNotFound) {
+TEST_CASE_METHOD(TApp, "IniRequiredNotFound", "[config]") {
 
     std::string noini = "TestIniNotExist.ini";
     app.set_config("--config", noini, "", true);
 
-    EXPECT_THROW(run(), CLI::FileError);
+    CHECK_THROWS_AS(run(), CLI::FileError);
 }
 
-TEST_F(TApp, IniNotRequiredPassedNotFound) {
+TEST_CASE_METHOD(TApp, "IniNotRequiredPassedNotFound", "[config]") {
 
     std::string noini = "TestIniNotExist.ini";
     app.set_config("--config", "", "", false);
 
     args = {"--config", noini};
-    EXPECT_THROW(run(), CLI::FileError);
+    CHECK_THROWS_AS(run(), CLI::FileError);
 }
 
-TEST_F(TApp, IniOverwrite) {
+TEST_CASE_METHOD(TApp, "IniOverwrite", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
     {
@@ -751,10 +749,10 @@ TEST_F(TApp, IniOverwrite) {
 
     run();
 
-    EXPECT_EQ(99, two);
+    CHECK(two == 99);
 }
 
-TEST_F(TApp, IniRequired) {
+TEST_CASE_METHOD(TApp, "IniRequired", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -775,28 +773,28 @@ TEST_F(TApp, IniRequired) {
     args = {"--one=1"};
 
     run();
-    EXPECT_EQ(one, 1);
-    EXPECT_EQ(two, 99);
-    EXPECT_EQ(three, 3);
+    CHECK(1 == one);
+    CHECK(99 == two);
+    CHECK(3 == three);
 
     one = two = three = 0;
     args = {"--one=1", "--two=2"};
 
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(one, 1);
-    EXPECT_EQ(two, 2);
-    EXPECT_EQ(three, 3);
+    CHECK_NOTHROW(run());
+    CHECK(1 == one);
+    CHECK(2 == two);
+    CHECK(3 == three);
 
     args = {};
 
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--two=2"};
 
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(TApp, IniVector) {
+TEST_CASE_METHOD(TApp, "IniVector", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -815,10 +813,10 @@ TEST_F(TApp, IniVector) {
 
     run();
 
-    EXPECT_EQ(std::vector<int>({2, 3}), two);
-    EXPECT_EQ(std::vector<int>({1, 2, 3}), three);
+    CHECK(two == std::vector<int>({2, 3}));
+    CHECK(three == std::vector<int>({1, 2, 3}));
 }
-TEST_F(TApp, TOMLVector) {
+TEST_CASE_METHOD(TApp, "TOMLVector", "[config]") {
 
     TempFile tmptoml{"TestTomlTmp.toml"};
 
@@ -838,11 +836,11 @@ TEST_F(TApp, TOMLVector) {
 
     run();
 
-    EXPECT_EQ(std::vector<int>({2, 3}), two);
-    EXPECT_EQ(std::vector<int>({1, 2, 3}), three);
+    CHECK(two == std::vector<int>({2, 3}));
+    CHECK(three == std::vector<int>({1, 2, 3}));
 }
 
-TEST_F(TApp, ColonValueSep) {
+TEST_CASE_METHOD(TApp, "ColonValueSep", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -864,11 +862,11 @@ TEST_F(TApp, ColonValueSep) {
 
     run();
 
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
+    CHECK(two == 2);
+    CHECK(three == 3);
 }
 
-TEST_F(TApp, TOMLVectordirect) {
+TEST_CASE_METHOD(TApp, "TOMLVectordirect", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -890,11 +888,11 @@ TEST_F(TApp, TOMLVectordirect) {
 
     run();
 
-    EXPECT_EQ(std::vector<int>({2, 3}), two);
-    EXPECT_EQ(std::vector<int>({1, 2, 3}), three);
+    CHECK(two == std::vector<int>({2, 3}));
+    CHECK(three == std::vector<int>({1, 2, 3}));
 }
 
-TEST_F(TApp, TOMLStringVector) {
+TEST_CASE_METHOD(TApp, "TOMLStringVector", "[config]") {
 
     TempFile tmptoml{"TestTomlTmp.toml"};
 
@@ -914,11 +912,11 @@ TEST_F(TApp, TOMLStringVector) {
 
     run();
 
-    EXPECT_EQ(std::vector<std::string>({"2", "3"}), two);
-    EXPECT_EQ(std::vector<std::string>({"1", "2", "3"}), three);
+    CHECK(two == std::vector<std::string>({"2", "3"}));
+    CHECK(three == std::vector<std::string>({"1", "2", "3"}));
 }
 
-TEST_F(TApp, IniVectorCsep) {
+TEST_CASE_METHOD(TApp, "IniVectorCsep", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -938,11 +936,11 @@ TEST_F(TApp, IniVectorCsep) {
 
     run();
 
-    EXPECT_EQ(std::vector<int>({2, 3}), two);
-    EXPECT_EQ(std::vector<int>({1, 2, 3}), three);
+    CHECK(two == std::vector<int>({2, 3}));
+    CHECK(three == std::vector<int>({1, 2, 3}));
 }
 
-TEST_F(TApp, IniVectorMultiple) {
+TEST_CASE_METHOD(TApp, "IniVectorMultiple", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -965,11 +963,11 @@ TEST_F(TApp, IniVectorMultiple) {
 
     run();
 
-    EXPECT_EQ(std::vector<int>({2, 3}), two);
-    EXPECT_EQ(std::vector<int>({1, 2, 3}), three);
+    CHECK(two == std::vector<int>({2, 3}));
+    CHECK(three == std::vector<int>({1, 2, 3}));
 }
 
-TEST_F(TApp, IniLayered) {
+TEST_CASE_METHOD(TApp, "IniLayered", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -993,15 +991,15 @@ TEST_F(TApp, IniLayered) {
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
 
-    EXPECT_EQ(subcom->count(), 0U);
-    EXPECT_FALSE(*subcom);
+    CHECK(0U == subcom->count());
+    CHECK(!*subcom);
 }
 
-TEST_F(TApp, IniLayeredDotSection) {
+TEST_CASE_METHOD(TApp, "IniLayeredDotSection", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1026,15 +1024,15 @@ TEST_F(TApp, IniLayeredDotSection) {
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
 
-    EXPECT_EQ(subcom->count(), 0U);
-    EXPECT_FALSE(*subcom);
+    CHECK(0U == subcom->count());
+    CHECK(!*subcom);
 }
 
-TEST_F(TApp, IniSubcommandConfigurable) {
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurable", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1059,16 +1057,16 @@ TEST_F(TApp, IniSubcommandConfigurable) {
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
 
-    EXPECT_EQ(subcom->count(), 1U);
-    EXPECT_TRUE(*subcom);
-    EXPECT_TRUE(app.got_subcommand(subcom));
+    CHECK(1U == subcom->count());
+    CHECK(*subcom);
+    CHECK(app.got_subcommand(subcom));
 }
 
-TEST_F(TApp, IniSubcommandConfigurablePreParse) {
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurablePreParse", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1098,18 +1096,18 @@ TEST_F(TApp, IniSubcommandConfigurablePreParse) {
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
-    EXPECT_EQ(0, four);
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+    CHECK(four == 0);
 
-    EXPECT_EQ(parse_c.size(), 1U);
-    EXPECT_EQ(parse_c[0], 2U);
+    CHECK(1U == parse_c.size());
+    CHECK(2U == parse_c[0]);
 
-    EXPECT_EQ(subcom2->count(), 0U);
+    CHECK(0U == subcom2->count());
 }
 
-TEST_F(TApp, IniSubcommandConfigurableParseComplete) {
+TEST_CASE_METHOD(TApp, "IniSubcommandConfigurableParseComplete", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1142,19 +1140,19 @@ TEST_F(TApp, IniSubcommandConfigurableParseComplete) {
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
-    EXPECT_EQ(0, four);
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+    CHECK(four == 0);
 
-    ASSERT_EQ(parse_c.size(), 2u);
-    EXPECT_EQ(parse_c[0], 68U);
-    EXPECT_EQ(parse_c[1], 58U);
-    EXPECT_EQ(subsubcom->count(), 1u);
-    EXPECT_EQ(subcom2->count(), 0u);
+    REQUIRE(2u == parse_c.size());
+    CHECK(68U == parse_c[0]);
+    CHECK(58U == parse_c[1]);
+    CHECK(1u == subsubcom->count());
+    CHECK(0u == subcom2->count());
 }
 
-TEST_F(TApp, IniSubcommandMultipleSections) {
+TEST_CASE_METHOD(TApp, "IniSubcommandMultipleSections", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1190,19 +1188,19 @@ TEST_F(TApp, IniSubcommandMultipleSections) {
 
     run();
 
-    EXPECT_EQ(1, one);
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(3, three);
-    EXPECT_EQ(4, four);
+    CHECK(one == 1);
+    CHECK(two == 2);
+    CHECK(three == 3);
+    CHECK(four == 4);
 
-    ASSERT_EQ(parse_c.size(), 2u);
-    EXPECT_EQ(parse_c[0], 68U);
-    EXPECT_EQ(parse_c[1], 58U);
-    EXPECT_EQ(subsubcom->count(), 1u);
-    EXPECT_EQ(subcom2->count(), 0u);  // not configurable but value is updated
+    REQUIRE(2u == parse_c.size());
+    CHECK(68U == parse_c[0]);
+    CHECK(58U == parse_c[1]);
+    CHECK(1u == subsubcom->count());
+    CHECK(0u == subcom2->count());
 }
 
-TEST_F(TApp, DuplicateSubcommandCallbacks) {
+TEST_CASE_METHOD(TApp, "DuplicateSubcommandCallbacks", "[config]") {
 
     TempFile tmptoml{"TesttomlTmp.toml"};
 
@@ -1219,14 +1217,14 @@ TEST_F(TApp, DuplicateSubcommandCallbacks) {
     int count{0};
     foo->callback([&count]() { ++count; });
     foo->immediate_callback();
-    EXPECT_TRUE(foo->get_immediate_callback());
+    CHECK(foo->get_immediate_callback());
     foo->configurable();
 
     run();
-    EXPECT_EQ(count, 3);
+    CHECK(3 == count);
 }
 
-TEST_F(TApp, IniFailure) {
+TEST_CASE_METHOD(TApp, "IniFailure", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1238,10 +1236,10 @@ TEST_F(TApp, IniFailure) {
         out << "val=1" << std::endl;
     }
 
-    EXPECT_THROW(run(), CLI::ConfigError);
+    CHECK_THROWS_AS(run(), CLI::ConfigError);
 }
 
-TEST_F(TApp, IniConfigurable) {
+TEST_CASE_METHOD(TApp, "IniConfigurable", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1255,11 +1253,11 @@ TEST_F(TApp, IniConfigurable) {
         out << "val=1" << std::endl;
     }
 
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(value);
+    REQUIRE_NOTHROW(run());
+    CHECK(value);
 }
 
-TEST_F(TApp, IniNotConfigurable) {
+TEST_CASE_METHOD(TApp, "IniNotConfigurable", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1273,10 +1271,10 @@ TEST_F(TApp, IniNotConfigurable) {
         out << "val=1" << std::endl;
     }
 
-    EXPECT_THROW(run(), CLI::ConfigError);
+    CHECK_THROWS_AS(run(), CLI::ConfigError);
 }
 
-TEST_F(TApp, IniSubFailure) {
+TEST_CASE_METHOD(TApp, "IniSubFailure", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1289,10 +1287,10 @@ TEST_F(TApp, IniSubFailure) {
         out << "val=1" << std::endl;
     }
 
-    EXPECT_THROW(run(), CLI::ConfigError);
+    CHECK_THROWS_AS(run(), CLI::ConfigError);
 }
 
-TEST_F(TApp, IniNoSubFailure) {
+TEST_CASE_METHOD(TApp, "IniNoSubFailure", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1304,10 +1302,10 @@ TEST_F(TApp, IniNoSubFailure) {
         out << "val=1" << std::endl;
     }
 
-    EXPECT_THROW(run(), CLI::ConfigError);
+    CHECK_THROWS_AS(run(), CLI::ConfigError);
 }
 
-TEST_F(TApp, IniFlagConvertFailure) {
+TEST_CASE_METHOD(TApp, "IniFlagConvertFailure", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1321,13 +1319,13 @@ TEST_F(TApp, IniFlagConvertFailure) {
     run();
     bool result{false};
     auto *opt = app.get_option("--flag");
-    EXPECT_THROW(opt->results(result), CLI::ConversionError);
+    CHECK_THROWS_AS(opt->results(result), CLI::ConversionError);
     std::string res;
     opt->results(res);
-    EXPECT_EQ(res, "moobook");
+    CHECK("moobook" == res);
 }
 
-TEST_F(TApp, IniFlagNumbers) {
+TEST_CASE_METHOD(TApp, "IniFlagNumbers", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1340,11 +1338,11 @@ TEST_F(TApp, IniFlagNumbers) {
         out << "flag=3" << std::endl;
     }
 
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(boo);
+    REQUIRE_NOTHROW(run());
+    CHECK(boo);
 }
 
-TEST_F(TApp, IniFlagDual) {
+TEST_CASE_METHOD(TApp, "IniFlagDual", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1358,10 +1356,10 @@ TEST_F(TApp, IniFlagDual) {
         out << "flag=1 1" << std::endl;
     }
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, IniShort) {
+TEST_CASE_METHOD(TApp, "IniShort", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1374,11 +1372,11 @@ TEST_F(TApp, IniShort) {
         out << "f=3" << std::endl;
     }
 
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(key, 3);
+    REQUIRE_NOTHROW(run());
+    CHECK(3 == key);
 }
 
-TEST_F(TApp, IniPositional) {
+TEST_CASE_METHOD(TApp, "IniPositional", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1391,11 +1389,11 @@ TEST_F(TApp, IniPositional) {
         out << "key=3" << std::endl;
     }
 
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(key, 3);
+    REQUIRE_NOTHROW(run());
+    CHECK(3 == key);
 }
 
-TEST_F(TApp, IniEnvironmental) {
+TEST_CASE_METHOD(TApp, "IniEnvironmental", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1408,11 +1406,11 @@ TEST_F(TApp, IniEnvironmental) {
         out << "CLI11_TEST_ENV_KEY_TMP=3" << std::endl;
     }
 
-    ASSERT_NO_THROW(run());
-    EXPECT_EQ(key, 3);
+    REQUIRE_NOTHROW(run());
+    CHECK(3 == key);
 }
 
-TEST_F(TApp, IniFlagText) {
+TEST_CASE_METHOD(TApp, "IniFlagText", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1433,13 +1431,13 @@ TEST_F(TApp, IniFlagText) {
 
     run();
 
-    EXPECT_TRUE(flag1);
-    EXPECT_TRUE(flag2);
-    EXPECT_FALSE(flag3);
-    EXPECT_TRUE(flag4);
+    CHECK(flag1);
+    CHECK(flag2);
+    CHECK(!flag3);
+    CHECK(flag4);
 }
 
-TEST_F(TApp, IniFlags) {
+TEST_CASE_METHOD(TApp, "IniFlags", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
     app.set_config("--config", tmpini);
 
@@ -1461,13 +1459,13 @@ TEST_F(TApp, IniFlags) {
 
     run();
 
-    EXPECT_EQ(2, two);
-    EXPECT_TRUE(three);
-    EXPECT_TRUE(four);
-    EXPECT_TRUE(five);
+    CHECK(two == 2);
+    CHECK(three);
+    CHECK(four);
+    CHECK(five);
 }
 
-TEST_F(TApp, IniFalseFlags) {
+TEST_CASE_METHOD(TApp, "IniFalseFlags", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
     app.set_config("--config", tmpini);
 
@@ -1489,13 +1487,13 @@ TEST_F(TApp, IniFalseFlags) {
 
     run();
 
-    EXPECT_EQ(-2, two);
-    EXPECT_FALSE(three);
-    EXPECT_TRUE(four);
-    EXPECT_TRUE(five);
+    CHECK(two == -2);
+    CHECK(!three);
+    CHECK(four);
+    CHECK(five);
 }
 
-TEST_F(TApp, IniFalseFlagsDef) {
+TEST_CASE_METHOD(TApp, "IniFalseFlagsDef", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
     app.set_config("--config", tmpini);
 
@@ -1517,13 +1515,13 @@ TEST_F(TApp, IniFalseFlagsDef) {
 
     run();
 
-    EXPECT_EQ(-2, two);
-    EXPECT_TRUE(three);
-    EXPECT_FALSE(four);
-    EXPECT_TRUE(five);
+    CHECK(two == -2);
+    CHECK(three);
+    CHECK(!four);
+    CHECK(five);
 }
 
-TEST_F(TApp, IniFalseFlagsDefDisableOverrideError) {
+TEST_CASE_METHOD(TApp, "IniFalseFlagsDefDisableOverrideError", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
     app.set_config("--config", tmpini);
 
@@ -1541,10 +1539,10 @@ TEST_F(TApp, IniFalseFlagsDefDisableOverrideError) {
     app.add_flag("!--four", four);
     app.add_flag("--five", five);
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, IniFalseFlagsDefDisableOverrideSuccess) {
+TEST_CASE_METHOD(TApp, "IniFalseFlagsDefDisableOverrideSuccess", "[config]") {
     TempFile tmpini{"TestIniTmp.ini"};
     app.set_config("--config", tmpini);
 
@@ -1563,12 +1561,12 @@ TEST_F(TApp, IniFalseFlagsDefDisableOverrideSuccess) {
 
     run();
 
-    EXPECT_EQ(2, two);
-    EXPECT_EQ(4, four);
-    EXPECT_EQ(15, val);
+    CHECK(two == 2);
+    CHECK(four == 4);
+    CHECK(val == 15);
 }
 
-TEST_F(TApp, TomlOutputSimple) {
+TEST_CASE_METHOD(TApp, "TomlOutputSimple", "[config]") {
 
     int v{0};
     app.add_option("--simple", v);
@@ -1578,10 +1576,10 @@ TEST_F(TApp, TomlOutputSimple) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("simple=3\n", str);
+    CHECK(str == "simple=3\n");
 }
 
-TEST_F(TApp, TomlOutputShort) {
+TEST_CASE_METHOD(TApp, "TomlOutputShort", "[config]") {
 
     int v{0};
     app.add_option("-s", v);
@@ -1591,10 +1589,10 @@ TEST_F(TApp, TomlOutputShort) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("s=3\n", str);
+    CHECK(str == "s=3\n");
 }
 
-TEST_F(TApp, TomlOutputPositional) {
+TEST_CASE_METHOD(TApp, "TomlOutputPositional", "[config]") {
 
     int v{0};
     app.add_option("pos", v);
@@ -1604,11 +1602,11 @@ TEST_F(TApp, TomlOutputPositional) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("pos=3\n", str);
+    CHECK(str == "pos=3\n");
 }
 
 // try the output with environmental only arguments
-TEST_F(TApp, TomlOutputEnvironmental) {
+TEST_CASE_METHOD(TApp, "TomlOutputEnvironmental", "[config]") {
 
     put_env("CLI11_TEST_ENV_TMP", "2");
 
@@ -1617,14 +1615,14 @@ TEST_F(TApp, TomlOutputEnvironmental) {
 
     run();
 
-    EXPECT_EQ(2, val);
+    CHECK(val == 2);
     std::string str = app.config_to_str();
-    EXPECT_EQ("CLI11_TEST_ENV_TMP=2\n", str);
+    CHECK(str == "CLI11_TEST_ENV_TMP=2\n");
 
     unset_env("CLI11_TEST_ENV_TMP");
 }
 
-TEST_F(TApp, TomlOutputNoConfigurable) {
+TEST_CASE_METHOD(TApp, "TomlOutputNoConfigurable", "[config]") {
 
     int v1{0}, v2{0};
     app.add_option("--simple", v1);
@@ -1635,10 +1633,10 @@ TEST_F(TApp, TomlOutputNoConfigurable) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("simple=3\n", str);
+    CHECK(str == "simple=3\n");
 }
 
-TEST_F(TApp, TomlOutputShortSingleDescription) {
+TEST_CASE_METHOD(TApp, "TomlOutputShortSingleDescription", "[config]") {
     std::string flag = "some_flag";
     const std::string description = "Some short description.";
     app.add_flag("--" + flag, description);
@@ -1646,10 +1644,10 @@ TEST_F(TApp, TomlOutputShortSingleDescription) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("# " + description + "\n" + flag + "=false\n"));
+    CHECK_THAT(str, Contains("# " + description + "\n" + flag + "=false\n"));
 }
 
-TEST_F(TApp, TomlOutputShortDoubleDescription) {
+TEST_CASE_METHOD(TApp, "TomlOutputShortDoubleDescription", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     const std::string description1 = "First description.";
@@ -1660,11 +1658,11 @@ TEST_F(TApp, TomlOutputShortDoubleDescription) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(
-        str, HasSubstr("# " + description1 + "\n" + flag1 + "=false\n\n# " + description2 + "\n" + flag2 + "=false\n"));
+    std::string ans = "# " + description1 + "\n" + flag1 + "=false\n\n# " + description2 + "\n" + flag2 + "=false\n";
+    CHECK_THAT(str, Contains(ans));
 }
 
-TEST_F(TApp, TomlOutputGroups) {
+TEST_CASE_METHOD(TApp, "TomlOutputGroups", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     const std::string description1 = "First description.";
@@ -1675,11 +1673,11 @@ TEST_F(TApp, TomlOutputGroups) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("group1"));
-    EXPECT_THAT(str, HasSubstr("group2"));
+    CHECK_THAT(str, Contains("group1"));
+    CHECK_THAT(str, Contains("group2"));
 }
 
-TEST_F(TApp, TomlOutputHiddenOptions) {
+TEST_CASE_METHOD(TApp, "TomlOutputHiddenOptions", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     double val{12.7};
@@ -1692,18 +1690,18 @@ TEST_F(TApp, TomlOutputHiddenOptions) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("group1"));
-    EXPECT_THAT(str, HasSubstr("group2"));
-    EXPECT_THAT(str, HasSubstr("dval=12.7"));
+    CHECK_THAT(str, Contains("group1"));
+    CHECK_THAT(str, Contains("group2"));
+    CHECK_THAT(str, Contains("dval=12.7"));
     auto loc = str.find("dval=12.7");
     auto locg1 = str.find("group1");
-    EXPECT_GT(locg1, loc);
+    CHECK(loc < locg1);
     // make sure it doesn't come twice
     loc = str.find("dval=12.7", loc + 4);
-    EXPECT_EQ(loc, std::string::npos);
+    CHECK(std::string::npos == loc);
 }
 
-TEST_F(TApp, TomlOutputMultiLineDescription) {
+TEST_CASE_METHOD(TApp, "TomlOutputMultiLineDescription", "[config]") {
     std::string flag = "some_flag";
     const std::string description = "Some short description.\nThat has lines.";
     app.add_flag("--" + flag, description);
@@ -1711,12 +1709,12 @@ TEST_F(TApp, TomlOutputMultiLineDescription) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("# Some short description.\n"));
-    EXPECT_THAT(str, HasSubstr("# That has lines.\n"));
-    EXPECT_THAT(str, HasSubstr(flag + "=false\n"));
+    CHECK_THAT(str, Contains("# Some short description.\n"));
+    CHECK_THAT(str, Contains("# That has lines.\n"));
+    CHECK_THAT(str, Contains(flag + "=false\n"));
 }
 
-TEST_F(TApp, TomlOutputOptionGroup) {
+TEST_CASE_METHOD(TApp, "TomlOutputOptionGroup", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     double val{12.7};
@@ -1730,22 +1728,22 @@ TEST_F(TApp, TomlOutputOptionGroup) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("group1"));
-    EXPECT_THAT(str, HasSubstr("group2"));
-    EXPECT_THAT(str, HasSubstr("dval=12.7"));
-    EXPECT_THAT(str, HasSubstr("group3"));
-    EXPECT_THAT(str, HasSubstr("g3 desc"));
+    CHECK_THAT(str, Contains("group1"));
+    CHECK_THAT(str, Contains("group2"));
+    CHECK_THAT(str, Contains("dval=12.7"));
+    CHECK_THAT(str, Contains("group3"));
+    CHECK_THAT(str, Contains("g3 desc"));
     auto loc = str.find("dval=12.7");
     auto locg1 = str.find("group1");
     auto locg3 = str.find("group3");
-    EXPECT_LT(locg1, loc);
+    CHECK(loc > locg1);
     // make sure it doesn't come twice
     loc = str.find("dval=12.7", loc + 4);
-    EXPECT_EQ(loc, std::string::npos);
-    EXPECT_GT(locg3, locg1);
+    CHECK(std::string::npos == loc);
+    CHECK(locg1 < locg3);
 }
 
-TEST_F(TApp, TomlOutputVector) {
+TEST_CASE_METHOD(TApp, "TomlOutputVector", "[config]") {
 
     std::vector<int> v;
     app.add_option("--vector", v);
@@ -1755,10 +1753,10 @@ TEST_F(TApp, TomlOutputVector) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("vector=[1, 2, 3]\n", str);
+    CHECK(str == "vector=[1, 2, 3]\n");
 }
 
-TEST_F(TApp, ConfigOutputVectorCustom) {
+TEST_CASE_METHOD(TApp, "ConfigOutputVectorCustom", "[config]") {
 
     std::vector<int> v;
     app.add_option("--vector", v);
@@ -1770,10 +1768,10 @@ TEST_F(TApp, ConfigOutputVectorCustom) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("vector:{1; 2; 3}\n", str);
+    CHECK(str == "vector:{1; 2; 3}\n");
 }
 
-TEST_F(TApp, TomlOutputFlag) {
+TEST_CASE_METHOD(TApp, "TomlOutputFlag", "[config]") {
 
     int v{0}, q{0};
     app.add_option("--simple", v);
@@ -1786,16 +1784,16 @@ TEST_F(TApp, TomlOutputFlag) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=3"));
-    EXPECT_THAT(str, Not(HasSubstr("nothing")));
-    EXPECT_THAT(str, HasSubstr("onething=true"));
-    EXPECT_THAT(str, HasSubstr("something=[true, true]"));
+    CHECK_THAT(str, Contains("simple=3"));
+    CHECK_THAT(str, !Contains("nothing"));
+    CHECK_THAT(str, Contains("onething=true"));
+    CHECK_THAT(str, Contains("something=[true, true]"));
 
     str = app.config_to_str(true);
-    EXPECT_THAT(str, HasSubstr("nothing"));
+    CHECK_THAT(str, Contains("nothing"));
 }
 
-TEST_F(TApp, TomlOutputSet) {
+TEST_CASE_METHOD(TApp, "TomlOutputSet", "[config]") {
 
     int v{0};
     app.add_option("--simple", v)->check(CLI::IsMember({1, 2, 3}));
@@ -1805,10 +1803,10 @@ TEST_F(TApp, TomlOutputSet) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=2"));
+    CHECK_THAT(str, Contains("simple=2"));
 }
 
-TEST_F(TApp, TomlOutputDefault) {
+TEST_CASE_METHOD(TApp, "TomlOutputDefault", "[config]") {
 
     int v{7};
     app.add_option("--simple", v, "", true);
@@ -1816,13 +1814,13 @@ TEST_F(TApp, TomlOutputDefault) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, Not(HasSubstr("simple=7")));
+    CHECK_THAT(str, !Contains("simple=7"));
 
     str = app.config_to_str(true);
-    EXPECT_THAT(str, HasSubstr("simple=7"));
+    CHECK_THAT(str, Contains("simple=7"));
 }
 
-TEST_F(TApp, TomlOutputSubcom) {
+TEST_CASE_METHOD(TApp, "TomlOutputSubcom", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other");
@@ -1832,11 +1830,11 @@ TEST_F(TApp, TomlOutputSubcom) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("other.newer=true"));
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("other.newer=true"));
 }
 
-TEST_F(TApp, TomlOutputSubcomConfigurable) {
+TEST_CASE_METHOD(TApp, "TomlOutputSubcomConfigurable", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other")->configurable();
@@ -1846,13 +1844,13 @@ TEST_F(TApp, TomlOutputSubcomConfigurable) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("[other]"));
-    EXPECT_THAT(str, HasSubstr("newer=true"));
-    EXPECT_EQ(str.find("other.newer=true"), std::string::npos);
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("[other]"));
+    CHECK_THAT(str, Contains("newer=true"));
+    CHECK(std::string::npos == str.find("other.newer=true"));
 }
 
-TEST_F(TApp, TomlOutputSubsubcom) {
+TEST_CASE_METHOD(TApp, "TomlOutputSubsubcom", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other");
@@ -1864,12 +1862,12 @@ TEST_F(TApp, TomlOutputSubsubcom) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("other.newer=true"));
-    EXPECT_THAT(str, HasSubstr("other.sub2.newest=true"));
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("other.newer=true"));
+    CHECK_THAT(str, Contains("other.sub2.newest=true"));
 }
 
-TEST_F(TApp, TomlOutputSubsubcomConfigurable) {
+TEST_CASE_METHOD(TApp, "TomlOutputSubsubcomConfigurable", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other")->configurable();
@@ -1882,15 +1880,15 @@ TEST_F(TApp, TomlOutputSubsubcomConfigurable) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("[other]"));
-    EXPECT_THAT(str, HasSubstr("newer=true"));
-    EXPECT_THAT(str, HasSubstr("[other.sub2]"));
-    EXPECT_THAT(str, HasSubstr("newest=true"));
-    EXPECT_EQ(str.find("sub2.newest=true"), std::string::npos);
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("[other]"));
+    CHECK_THAT(str, Contains("newer=true"));
+    CHECK_THAT(str, Contains("[other.sub2]"));
+    CHECK_THAT(str, Contains("newest=true"));
+    CHECK(std::string::npos == str.find("sub2.newest=true"));
 }
 
-TEST_F(TApp, TomlOutputSubsubcomConfigurableDeep) {
+TEST_CASE_METHOD(TApp, "TomlOutputSubsubcomConfigurableDeep", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other")->configurable();
@@ -1907,13 +1905,13 @@ TEST_F(TApp, TomlOutputSubsubcomConfigurableDeep) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("[other.sub2.sub-level2.sub-level3]"));
-    EXPECT_THAT(str, HasSubstr("absolute_newest=true"));
-    EXPECT_EQ(str.find(".absolute_newest=true"), std::string::npos);
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("[other.sub2.sub-level2.sub-level3]"));
+    CHECK_THAT(str, Contains("absolute_newest=true"));
+    CHECK(std::string::npos == str.find(".absolute_newest=true"));
 }
 
-TEST_F(TApp, TomlOutputQuoted) {
+TEST_CASE_METHOD(TApp, "TomlOutputQuoted", "[config]") {
 
     std::string val1;
     app.add_option("--val1", val1);
@@ -1925,15 +1923,15 @@ TEST_F(TApp, TomlOutputQuoted) {
 
     run();
 
-    EXPECT_EQ("I am a string", val1);
-    EXPECT_EQ("I am a \"confusing\" string", val2);
+    CHECK(val1 == "I am a string");
+    CHECK(val2 == "I am a \"confusing\" string");
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("val1=\"I am a string\""));
-    EXPECT_THAT(str, HasSubstr("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val1=\"I am a string\""));
+    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
 }
 
-TEST_F(TApp, DefaultsTomlOutputQuoted) {
+TEST_CASE_METHOD(TApp, "DefaultsTomlOutputQuoted", "[config]") {
 
     std::string val1{"I am a string"};
     app.add_option("--val1", val1, "", true);
@@ -1944,18 +1942,18 @@ TEST_F(TApp, DefaultsTomlOutputQuoted) {
     run();
 
     std::string str = app.config_to_str(true);
-    EXPECT_THAT(str, HasSubstr("val1=\"I am a string\""));
-    EXPECT_THAT(str, HasSubstr("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val1=\"I am a string\""));
+    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
 }
 
 // #298
-TEST_F(TApp, StopReadingConfigOnClear) {
+TEST_CASE_METHOD(TApp, "StopReadingConfigOnClear", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
     app.set_config("--config", tmpini);
     auto ptr = app.set_config();  // Should *not* read config file
-    EXPECT_EQ(ptr, nullptr);
+    CHECK(nullptr == ptr);
 
     {
         std::ofstream out{tmpini};
@@ -1967,10 +1965,10 @@ TEST_F(TApp, StopReadingConfigOnClear) {
 
     run();
 
-    EXPECT_EQ(volume, 0);
+    CHECK(0 == volume);
 }
 
-TEST_F(TApp, ConfigWriteReadWrite) {
+TEST_CASE_METHOD(TApp, "ConfigWriteReadWrite", "[config]") {
 
     TempFile tmpini{"TestIniTmp.ini"};
 
@@ -1989,12 +1987,12 @@ TEST_F(TApp, ConfigWriteReadWrite) {
 
     std::string config2 = app.config_to_str(true, true);
 
-    EXPECT_EQ(config1, config2);
+    CHECK(config2 == config1);
 }
 
 /////// INI output tests
 
-TEST_F(TApp, IniOutputSimple) {
+TEST_CASE_METHOD(TApp, "IniOutputSimple", "[config]") {
 
     int v{0};
     app.add_option("--simple", v);
@@ -2004,10 +2002,10 @@ TEST_F(TApp, IniOutputSimple) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("simple=3\n", str);
+    CHECK(str == "simple=3\n");
 }
 
-TEST_F(TApp, IniOutputNoConfigurable) {
+TEST_CASE_METHOD(TApp, "IniOutputNoConfigurable", "[config]") {
 
     int v1{0}, v2{0};
     app.add_option("--simple", v1);
@@ -2018,10 +2016,10 @@ TEST_F(TApp, IniOutputNoConfigurable) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("simple=3\n", str);
+    CHECK(str == "simple=3\n");
 }
 
-TEST_F(TApp, IniOutputShortSingleDescription) {
+TEST_CASE_METHOD(TApp, "IniOutputShortSingleDescription", "[config]") {
     std::string flag = "some_flag";
     const std::string description = "Some short description.";
     app.add_flag("--" + flag, description);
@@ -2029,10 +2027,10 @@ TEST_F(TApp, IniOutputShortSingleDescription) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("; " + description + "\n" + flag + "=false\n"));
+    CHECK_THAT(str, Contains("; " + description + "\n" + flag + "=false\n"));
 }
 
-TEST_F(TApp, IniOutputShortDoubleDescription) {
+TEST_CASE_METHOD(TApp, "IniOutputShortDoubleDescription", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     const std::string description1 = "First description.";
@@ -2043,11 +2041,11 @@ TEST_F(TApp, IniOutputShortDoubleDescription) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(
-        str, HasSubstr("; " + description1 + "\n" + flag1 + "=false\n\n; " + description2 + "\n" + flag2 + "=false\n"));
+    std::string ans = "; " + description1 + "\n" + flag1 + "=false\n\n; " + description2 + "\n" + flag2 + "=false\n";
+    CHECK_THAT(str, Contains(ans));
 }
 
-TEST_F(TApp, IniOutputGroups) {
+TEST_CASE_METHOD(TApp, "IniOutputGroups", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     const std::string description1 = "First description.";
@@ -2058,11 +2056,11 @@ TEST_F(TApp, IniOutputGroups) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("group1"));
-    EXPECT_THAT(str, HasSubstr("group2"));
+    CHECK_THAT(str, Contains("group1"));
+    CHECK_THAT(str, Contains("group2"));
 }
 
-TEST_F(TApp, IniOutputHiddenOptions) {
+TEST_CASE_METHOD(TApp, "IniOutputHiddenOptions", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     double val{12.7};
@@ -2075,18 +2073,18 @@ TEST_F(TApp, IniOutputHiddenOptions) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("group1"));
-    EXPECT_THAT(str, HasSubstr("group2"));
-    EXPECT_THAT(str, HasSubstr("dval=12.7"));
+    CHECK_THAT(str, Contains("group1"));
+    CHECK_THAT(str, Contains("group2"));
+    CHECK_THAT(str, Contains("dval=12.7"));
     auto loc = str.find("dval=12.7");
     auto locg1 = str.find("group1");
-    EXPECT_GT(locg1, loc);
+    CHECK(loc < locg1);
     // make sure it doesn't come twice
     loc = str.find("dval=12.7", loc + 4);
-    EXPECT_EQ(loc, std::string::npos);
+    CHECK(std::string::npos == loc);
 }
 
-TEST_F(TApp, IniOutputMultiLineDescription) {
+TEST_CASE_METHOD(TApp, "IniOutputMultiLineDescription", "[config]") {
     std::string flag = "some_flag";
     const std::string description = "Some short description.\nThat has lines.";
     app.add_flag("--" + flag, description);
@@ -2094,12 +2092,12 @@ TEST_F(TApp, IniOutputMultiLineDescription) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("; Some short description.\n"));
-    EXPECT_THAT(str, HasSubstr("; That has lines.\n"));
-    EXPECT_THAT(str, HasSubstr(flag + "=false\n"));
+    CHECK_THAT(str, Contains("; Some short description.\n"));
+    CHECK_THAT(str, Contains("; That has lines.\n"));
+    CHECK_THAT(str, Contains(flag + "=false\n"));
 }
 
-TEST_F(TApp, IniOutputOptionGroup) {
+TEST_CASE_METHOD(TApp, "IniOutputOptionGroup", "[config]") {
     std::string flag1 = "flagnr1";
     std::string flag2 = "flagnr2";
     double val{12.7};
@@ -2113,22 +2111,22 @@ TEST_F(TApp, IniOutputOptionGroup) {
     run();
 
     std::string str = app.config_to_str(true, true);
-    EXPECT_THAT(str, HasSubstr("group1"));
-    EXPECT_THAT(str, HasSubstr("group2"));
-    EXPECT_THAT(str, HasSubstr("dval=12.7"));
-    EXPECT_THAT(str, HasSubstr("group3"));
-    EXPECT_THAT(str, HasSubstr("g3 desc"));
+    CHECK_THAT(str, Contains("group1"));
+    CHECK_THAT(str, Contains("group2"));
+    CHECK_THAT(str, Contains("dval=12.7"));
+    CHECK_THAT(str, Contains("group3"));
+    CHECK_THAT(str, Contains("g3 desc"));
     auto loc = str.find("dval=12.7");
     auto locg1 = str.find("group1");
     auto locg3 = str.find("group3");
-    EXPECT_LT(locg1, loc);
+    CHECK(loc > locg1);
     // make sure it doesn't come twice
     loc = str.find("dval=12.7", loc + 4);
-    EXPECT_EQ(loc, std::string::npos);
-    EXPECT_GT(locg3, locg1);
+    CHECK(std::string::npos == loc);
+    CHECK(locg1 < locg3);
 }
 
-TEST_F(TApp, IniOutputVector) {
+TEST_CASE_METHOD(TApp, "IniOutputVector", "[config]") {
 
     std::vector<int> v;
     app.add_option("--vector", v);
@@ -2138,10 +2136,10 @@ TEST_F(TApp, IniOutputVector) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_EQ("vector=1 2 3\n", str);
+    CHECK(str == "vector=1 2 3\n");
 }
 
-TEST_F(TApp, IniOutputFlag) {
+TEST_CASE_METHOD(TApp, "IniOutputFlag", "[config]") {
 
     int v{0}, q{0};
     app.add_option("--simple", v);
@@ -2154,16 +2152,16 @@ TEST_F(TApp, IniOutputFlag) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=3"));
-    EXPECT_THAT(str, Not(HasSubstr("nothing")));
-    EXPECT_THAT(str, HasSubstr("onething=true"));
-    EXPECT_THAT(str, HasSubstr("something=true true"));
+    CHECK_THAT(str, Contains("simple=3"));
+    CHECK_THAT(str, !Contains("nothing"));
+    CHECK_THAT(str, Contains("onething=true"));
+    CHECK_THAT(str, Contains("something=true true"));
 
     str = app.config_to_str(true);
-    EXPECT_THAT(str, HasSubstr("nothing"));
+    CHECK_THAT(str, Contains("nothing"));
 }
 
-TEST_F(TApp, IniOutputSet) {
+TEST_CASE_METHOD(TApp, "IniOutputSet", "[config]") {
 
     int v{0};
     app.add_option("--simple", v)->check(CLI::IsMember({1, 2, 3}));
@@ -2173,10 +2171,10 @@ TEST_F(TApp, IniOutputSet) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=2"));
+    CHECK_THAT(str, Contains("simple=2"));
 }
 
-TEST_F(TApp, IniOutputDefault) {
+TEST_CASE_METHOD(TApp, "IniOutputDefault", "[config]") {
 
     int v{7};
     app.add_option("--simple", v, "", true);
@@ -2184,13 +2182,13 @@ TEST_F(TApp, IniOutputDefault) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, Not(HasSubstr("simple=7")));
+    CHECK_THAT(str, !Contains("simple=7"));
 
     str = app.config_to_str(true);
-    EXPECT_THAT(str, HasSubstr("simple=7"));
+    CHECK_THAT(str, Contains("simple=7"));
 }
 
-TEST_F(TApp, IniOutputSubcom) {
+TEST_CASE_METHOD(TApp, "IniOutputSubcom", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other");
@@ -2200,11 +2198,11 @@ TEST_F(TApp, IniOutputSubcom) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("other.newer=true"));
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("other.newer=true"));
 }
 
-TEST_F(TApp, IniOutputSubcomConfigurable) {
+TEST_CASE_METHOD(TApp, "IniOutputSubcomConfigurable", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other")->configurable();
@@ -2214,13 +2212,13 @@ TEST_F(TApp, IniOutputSubcomConfigurable) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("[other]"));
-    EXPECT_THAT(str, HasSubstr("newer=true"));
-    EXPECT_EQ(str.find("other.newer=true"), std::string::npos);
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("[other]"));
+    CHECK_THAT(str, Contains("newer=true"));
+    CHECK(std::string::npos == str.find("other.newer=true"));
 }
 
-TEST_F(TApp, IniOutputSubsubcom) {
+TEST_CASE_METHOD(TApp, "IniOutputSubsubcom", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other");
@@ -2232,12 +2230,12 @@ TEST_F(TApp, IniOutputSubsubcom) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("other.newer=true"));
-    EXPECT_THAT(str, HasSubstr("other.sub2.newest=true"));
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("other.newer=true"));
+    CHECK_THAT(str, Contains("other.sub2.newest=true"));
 }
 
-TEST_F(TApp, IniOutputSubsubcomConfigurable) {
+TEST_CASE_METHOD(TApp, "IniOutputSubsubcomConfigurable", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other")->configurable();
@@ -2250,15 +2248,15 @@ TEST_F(TApp, IniOutputSubsubcomConfigurable) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("[other]"));
-    EXPECT_THAT(str, HasSubstr("newer=true"));
-    EXPECT_THAT(str, HasSubstr("[other.sub2]"));
-    EXPECT_THAT(str, HasSubstr("newest=true"));
-    EXPECT_EQ(str.find("sub2.newest=true"), std::string::npos);
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("[other]"));
+    CHECK_THAT(str, Contains("newer=true"));
+    CHECK_THAT(str, Contains("[other.sub2]"));
+    CHECK_THAT(str, Contains("newest=true"));
+    CHECK(std::string::npos == str.find("sub2.newest=true"));
 }
 
-TEST_F(TApp, IniOutputSubsubcomConfigurableDeep) {
+TEST_CASE_METHOD(TApp, "IniOutputSubsubcomConfigurableDeep", "[config]") {
 
     app.add_flag("--simple");
     auto subcom = app.add_subcommand("other")->configurable();
@@ -2275,13 +2273,13 @@ TEST_F(TApp, IniOutputSubsubcomConfigurableDeep) {
     run();
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("simple=true"));
-    EXPECT_THAT(str, HasSubstr("[other.sub2.sub-level2.sub-level3]"));
-    EXPECT_THAT(str, HasSubstr("absolute_newest=true"));
-    EXPECT_EQ(str.find(".absolute_newest=true"), std::string::npos);
+    CHECK_THAT(str, Contains("simple=true"));
+    CHECK_THAT(str, Contains("[other.sub2.sub-level2.sub-level3]"));
+    CHECK_THAT(str, Contains("absolute_newest=true"));
+    CHECK(std::string::npos == str.find(".absolute_newest=true"));
 }
 
-TEST_F(TApp, IniOutputQuoted) {
+TEST_CASE_METHOD(TApp, "IniOutputQuoted", "[config]") {
 
     std::string val1;
     app.add_option("--val1", val1);
@@ -2293,15 +2291,15 @@ TEST_F(TApp, IniOutputQuoted) {
 
     run();
 
-    EXPECT_EQ("I am a string", val1);
-    EXPECT_EQ("I am a \"confusing\" string", val2);
+    CHECK(val1 == "I am a string");
+    CHECK(val2 == "I am a \"confusing\" string");
 
     std::string str = app.config_to_str();
-    EXPECT_THAT(str, HasSubstr("val1=\"I am a string\""));
-    EXPECT_THAT(str, HasSubstr("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val1=\"I am a string\""));
+    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
 }
 
-TEST_F(TApp, DefaultsIniOutputQuoted) {
+TEST_CASE_METHOD(TApp, "DefaultsIniOutputQuoted", "[config]") {
 
     std::string val1{"I am a string"};
     app.add_option("--val1", val1, "", true);
@@ -2312,6 +2310,6 @@ TEST_F(TApp, DefaultsIniOutputQuoted) {
     run();
 
     std::string str = app.config_to_str(true);
-    EXPECT_THAT(str, HasSubstr("val1=\"I am a string\""));
-    EXPECT_THAT(str, HasSubstr("val2='I am a \"confusing\" string'"));
+    CHECK_THAT(str, Contains("val1=\"I am a string\""));
+    CHECK_THAT(str, Contains("val2='I am a \"confusing\" string'"));
 }
diff --git a/packages/CLI11/tests/CreationTest.cpp b/packages/CLI11/tests/CreationTest.cpp
index 648c4aebba3de45e1e4e85232eafdd54e4c2ce0c..2a70f70d419552e96e13e0cf3a3c6f524a64ce68 100644
--- a/packages/CLI11/tests/CreationTest.cpp
+++ b/packages/CLI11/tests/CreationTest.cpp
@@ -7,221 +7,224 @@
 #include "app_helper.hpp"
 #include <cstdlib>
 
-TEST_F(TApp, AddingExistingShort) {
+TEST_CASE_METHOD(TApp, "AddingExistingShort", "[creation]") {
     CLI::Option *opt = app.add_flag("-c,--count");
-    EXPECT_EQ(opt->get_lnames(), std::vector<std::string>({"count"}));
-    EXPECT_EQ(opt->get_snames(), std::vector<std::string>({"c"}));
+    CHECK(std::vector<std::string>({"count"}) == opt->get_lnames());
+    CHECK(std::vector<std::string>({"c"}) == opt->get_snames());
 
-    EXPECT_THROW(app.add_flag("--cat,-c"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_flag("--cat,-c"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingLong) {
+TEST_CASE_METHOD(TApp, "AddingExistingLong", "[creation]") {
     app.add_flag("-q,--count");
-    EXPECT_THROW(app.add_flag("--count,-c"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_flag("--count,-c"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingShortNoCase) {
+TEST_CASE_METHOD(TApp, "AddingExistingShortNoCase", "[creation]") {
     app.add_flag("-C,--count")->ignore_case();
-    EXPECT_THROW(app.add_flag("--cat,-c"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_flag("--cat,-c"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingLongNoCase) {
+TEST_CASE_METHOD(TApp, "AddingExistingLongNoCase", "[creation]") {
     app.add_flag("-q,--count")->ignore_case();
-    EXPECT_THROW(app.add_flag("--Count,-c"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_flag("--Count,-c"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingNoCaseReversed) {
+TEST_CASE_METHOD(TApp, "AddingExistingNoCaseReversed", "[creation]") {
     app.add_flag("-c,--count")->ignore_case();
-    EXPECT_THROW(app.add_flag("--cat,-C"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_flag("--cat,-C"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingWithCase) {
+TEST_CASE_METHOD(TApp, "AddingExistingWithCase", "[creation]") {
     app.add_flag("-c,--count");
-    EXPECT_NO_THROW(app.add_flag("--Cat,-C"));
+    CHECK_NOTHROW(app.add_flag("--Cat,-C"));
 }
 
-TEST_F(TApp, AddingExistingWithCaseAfter) {
+TEST_CASE_METHOD(TApp, "AddingExistingWithCaseAfter", "[creation]") {
     auto count = app.add_flag("-c,--count");
     app.add_flag("--Cat,-C");
 
-    EXPECT_THROW(count->ignore_case(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(count->ignore_case(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingWithCaseAfter2) {
+TEST_CASE_METHOD(TApp, "AddingExistingWithCaseAfter2", "[creation]") {
     app.add_flag("-c,--count");
     auto cat = app.add_flag("--Cat,-C");
 
-    EXPECT_THROW(cat->ignore_case(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(cat->ignore_case(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingWithUnderscoreAfter) {
+TEST_CASE_METHOD(TApp, "AddingExistingWithUnderscoreAfter", "[creation]") {
     auto count = app.add_flag("--underscore");
     app.add_flag("--under_score");
 
-    EXPECT_THROW(count->ignore_underscore(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(count->ignore_underscore(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingExistingWithUnderscoreAfter2) {
+TEST_CASE_METHOD(TApp, "AddingExistingWithUnderscoreAfter2", "[creation]") {
     auto count = app.add_flag("--under_score");
     app.add_flag("--underscore");
 
-    EXPECT_THROW(count->ignore_underscore(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(count->ignore_underscore(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, AddingMultipleInfPositionals) {
+TEST_CASE_METHOD(TApp, "AddingMultipleInfPositionals", "[creation]") {
     std::vector<std::string> one, two;
     app.add_option("one", one);
     app.add_option("two", two);
 
-    EXPECT_THROW(run(), CLI::InvalidError);
+    CHECK_THROWS_AS(run(), CLI::InvalidError);
 }
 
-TEST_F(TApp, AddingMultipleInfPositionalsSubcom) {
+TEST_CASE_METHOD(TApp, "AddingMultipleInfPositionalsSubcom", "[creation]") {
     std::vector<std::string> one, two;
     CLI::App *below = app.add_subcommand("below");
     below->add_option("one", one);
     below->add_option("two", two);
 
-    EXPECT_THROW(run(), CLI::InvalidError);
+    CHECK_THROWS_AS(run(), CLI::InvalidError);
 }
 
-TEST_F(TApp, MultipleSubcomMatching) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatching", "[creation]") {
     app.add_subcommand("first");
     app.add_subcommand("second");
     app.add_subcommand("Second");
-    EXPECT_THROW(app.add_subcommand("first"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_subcommand("first"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, RecoverSubcommands) {
+TEST_CASE_METHOD(TApp, "RecoverSubcommands", "[creation]") {
     CLI::App *app1 = app.add_subcommand("app1");
     CLI::App *app2 = app.add_subcommand("app2");
     CLI::App *app3 = app.add_subcommand("app3");
     CLI::App *app4 = app.add_subcommand("app4");
 
-    EXPECT_EQ(app.get_subcommands({}), std::vector<CLI::App *>({app1, app2, app3, app4}));
+    CHECK(std::vector<CLI::App *>({app1, app2, app3, app4}) == app.get_subcommands({}));
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithCase) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithCase", "[creation]") {
     app.add_subcommand("first")->ignore_case();
-    EXPECT_THROW(app.add_subcommand("fIrst"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_subcommand("fIrst"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithCaseFirst) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithCaseFirst", "[creation]") {
     app.ignore_case();
     app.add_subcommand("first");
-    EXPECT_THROW(app.add_subcommand("fIrst"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_subcommand("fIrst"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithUnderscore) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithUnderscore", "[creation]") {
     app.add_subcommand("first_option")->ignore_underscore();
-    EXPECT_THROW(app.add_subcommand("firstoption"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_subcommand("firstoption"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithUnderscoreFirst) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithUnderscoreFirst", "[creation]") {
     app.ignore_underscore();
     app.add_subcommand("first_option");
-    EXPECT_THROW(app.add_subcommand("firstoption"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(app.add_subcommand("firstoption"), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithCaseInplace) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithCaseInplace", "[creation]") {
     app.add_subcommand("first");
     auto first = app.add_subcommand("fIrst");
 
-    EXPECT_THROW(first->ignore_case(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(first->ignore_case(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithCaseInplace2) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithCaseInplace2", "[creation]") {
     auto first = app.add_subcommand("first");
     app.add_subcommand("fIrst");
 
-    EXPECT_THROW(first->ignore_case(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(first->ignore_case(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithUnderscoreInplace) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithUnderscoreInplace", "[creation]") {
     app.add_subcommand("first_option");
     auto first = app.add_subcommand("firstoption");
 
-    EXPECT_THROW(first->ignore_underscore(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(first->ignore_underscore(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomMatchingWithUnderscoreInplace2) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomMatchingWithUnderscoreInplace2", "[creation]") {
     auto first = app.add_subcommand("firstoption");
     app.add_subcommand("first_option");
 
-    EXPECT_THROW(first->ignore_underscore(), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(first->ignore_underscore(), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, MultipleSubcomNoMatchingInplace2) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomNoMatchingInplace2", "[creation]") {
     auto first = app.add_subcommand("first");
     auto second = app.add_subcommand("second");
 
-    EXPECT_NO_THROW(first->ignore_case());
-    EXPECT_NO_THROW(second->ignore_case());
+    CHECK_NOTHROW(first->ignore_case());
+    CHECK_NOTHROW(second->ignore_case());
 }
 
-TEST_F(TApp, MultipleSubcomNoMatchingInplaceUnderscore2) {
+TEST_CASE_METHOD(TApp, "MultipleSubcomNoMatchingInplaceUnderscore2", "[creation]") {
     auto first = app.add_subcommand("first_option");
     auto second = app.add_subcommand("second_option");
 
-    EXPECT_NO_THROW(first->ignore_underscore());
-    EXPECT_NO_THROW(second->ignore_underscore());
+    CHECK_NOTHROW(first->ignore_underscore());
+    CHECK_NOTHROW(second->ignore_underscore());
 }
 
-TEST_F(TApp, IncorrectConstructionFlagPositional1) { EXPECT_THROW(app.add_flag("cat"), CLI::IncorrectConstruction); }
+TEST_CASE_METHOD(TApp, "IncorrectConstructionFlagPositional1", "[creation]") {
+    // This wants to be one line with clang-format
+    CHECK_THROWS_AS(app.add_flag("cat"), CLI::IncorrectConstruction);
+}
 
-TEST_F(TApp, IncorrectConstructionFlagPositional2) {
+TEST_CASE_METHOD(TApp, "IncorrectConstructionFlagPositional2", "[creation]") {
     int x{0};
-    EXPECT_THROW(app.add_flag("cat", x), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_flag("cat", x), CLI::IncorrectConstruction);
 }
 
-TEST_F(TApp, IncorrectConstructionFlagPositional3) {
+TEST_CASE_METHOD(TApp, "IncorrectConstructionFlagPositional3", "[creation]") {
     bool x{false};
-    EXPECT_THROW(app.add_flag("cat", x), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_flag("cat", x), CLI::IncorrectConstruction);
 }
 
-TEST_F(TApp, IncorrectConstructionNeedsCannotFind) {
+TEST_CASE_METHOD(TApp, "IncorrectConstructionNeedsCannotFind", "[creation]") {
     auto cat = app.add_flag("--cat");
-    EXPECT_THROW(cat->needs("--nothing"), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(cat->needs("--nothing"), CLI::IncorrectConstruction);
 }
 
-TEST_F(TApp, IncorrectConstructionExcludesCannotFind) {
+TEST_CASE_METHOD(TApp, "IncorrectConstructionExcludesCannotFind", "[creation]") {
     auto cat = app.add_flag("--cat");
-    EXPECT_THROW(cat->excludes("--nothing"), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(cat->excludes("--nothing"), CLI::IncorrectConstruction);
 }
 
-TEST_F(TApp, IncorrectConstructionDuplicateNeeds) {
+TEST_CASE_METHOD(TApp, "IncorrectConstructionDuplicateNeeds", "[creation]") {
     auto cat = app.add_flag("--cat");
     auto other = app.add_flag("--other");
-    ASSERT_NO_THROW(cat->needs(other));
+    REQUIRE_NOTHROW(cat->needs(other));
     // duplicated needs is redundant but not an error
-    EXPECT_NO_THROW(cat->needs(other));
+    CHECK_NOTHROW(cat->needs(other));
 }
 
-TEST_F(TApp, IncorrectConstructionDuplicateNeedsTxt) {
+TEST_CASE_METHOD(TApp, "IncorrectConstructionDuplicateNeedsTxt", "[creation]") {
     auto cat = app.add_flag("--cat");
     app.add_flag("--other");
-    ASSERT_NO_THROW(cat->needs("--other"));
+    REQUIRE_NOTHROW(cat->needs("--other"));
     // duplicate needs is redundant but not an error
-    EXPECT_NO_THROW(cat->needs("--other"));
+    CHECK_NOTHROW(cat->needs("--other"));
 }
 
 // Now allowed
-TEST_F(TApp, CorrectConstructionDuplicateExcludes) {
+TEST_CASE_METHOD(TApp, "CorrectConstructionDuplicateExcludes", "[creation]") {
     auto cat = app.add_flag("--cat");
     auto other = app.add_flag("--other");
-    ASSERT_NO_THROW(cat->excludes(other));
-    ASSERT_NO_THROW(other->excludes(cat));
+    REQUIRE_NOTHROW(cat->excludes(other));
+    REQUIRE_NOTHROW(other->excludes(cat));
 }
 
 // Now allowed
-TEST_F(TApp, CorrectConstructionDuplicateExcludesTxt) {
+TEST_CASE_METHOD(TApp, "CorrectConstructionDuplicateExcludesTxt", "[creation]") {
     auto cat = app.add_flag("--cat");
     auto other = app.add_flag("--other");
-    ASSERT_NO_THROW(cat->excludes("--other"));
-    ASSERT_NO_THROW(other->excludes("--cat"));
+    REQUIRE_NOTHROW(cat->excludes("--other"));
+    REQUIRE_NOTHROW(other->excludes("--cat"));
 }
 
-TEST_F(TApp, CheckName) {
+TEST_CASE_METHOD(TApp, "CheckName", "[creation]") {
     auto long1 = app.add_flag("--long1");
     auto long2 = app.add_flag("--Long2");
     auto short1 = app.add_flag("-a");
@@ -230,26 +233,26 @@ TEST_F(TApp, CheckName) {
     auto pos1 = app.add_option("pos1", x);
     auto pos2 = app.add_option("pOs2", y);
 
-    EXPECT_TRUE(long1->check_name("--long1"));
-    EXPECT_FALSE(long1->check_name("--lonG1"));
+    CHECK(long1->check_name("--long1"));
+    CHECK(!long1->check_name("--lonG1"));
 
-    EXPECT_TRUE(long2->check_name("--Long2"));
-    EXPECT_FALSE(long2->check_name("--long2"));
+    CHECK(long2->check_name("--Long2"));
+    CHECK(!long2->check_name("--long2"));
 
-    EXPECT_TRUE(short1->check_name("-a"));
-    EXPECT_FALSE(short1->check_name("-A"));
+    CHECK(short1->check_name("-a"));
+    CHECK(!short1->check_name("-A"));
 
-    EXPECT_TRUE(short2->check_name("-B"));
-    EXPECT_FALSE(short2->check_name("-b"));
+    CHECK(short2->check_name("-B"));
+    CHECK(!short2->check_name("-b"));
 
-    EXPECT_TRUE(pos1->check_name("pos1"));
-    EXPECT_FALSE(pos1->check_name("poS1"));
+    CHECK(pos1->check_name("pos1"));
+    CHECK(!pos1->check_name("poS1"));
 
-    EXPECT_TRUE(pos2->check_name("pOs2"));
-    EXPECT_FALSE(pos2->check_name("pos2"));
+    CHECK(pos2->check_name("pOs2"));
+    CHECK(!pos2->check_name("pos2"));
 }
 
-TEST_F(TApp, CheckNameNoCase) {
+TEST_CASE_METHOD(TApp, "CheckNameNoCase", "[creation]") {
     auto long1 = app.add_flag("--long1")->ignore_case();
     auto long2 = app.add_flag("--Long2")->ignore_case();
     auto short1 = app.add_flag("-a")->ignore_case();
@@ -258,26 +261,26 @@ TEST_F(TApp, CheckNameNoCase) {
     auto pos1 = app.add_option("pos1", x)->ignore_case();
     auto pos2 = app.add_option("pOs2", y)->ignore_case();
 
-    EXPECT_TRUE(long1->check_name("--long1"));
-    EXPECT_TRUE(long1->check_name("--lonG1"));
+    CHECK(long1->check_name("--long1"));
+    CHECK(long1->check_name("--lonG1"));
 
-    EXPECT_TRUE(long2->check_name("--Long2"));
-    EXPECT_TRUE(long2->check_name("--long2"));
+    CHECK(long2->check_name("--Long2"));
+    CHECK(long2->check_name("--long2"));
 
-    EXPECT_TRUE(short1->check_name("-a"));
-    EXPECT_TRUE(short1->check_name("-A"));
+    CHECK(short1->check_name("-a"));
+    CHECK(short1->check_name("-A"));
 
-    EXPECT_TRUE(short2->check_name("-B"));
-    EXPECT_TRUE(short2->check_name("-b"));
+    CHECK(short2->check_name("-B"));
+    CHECK(short2->check_name("-b"));
 
-    EXPECT_TRUE(pos1->check_name("pos1"));
-    EXPECT_TRUE(pos1->check_name("poS1"));
+    CHECK(pos1->check_name("pos1"));
+    CHECK(pos1->check_name("poS1"));
 
-    EXPECT_TRUE(pos2->check_name("pOs2"));
-    EXPECT_TRUE(pos2->check_name("pos2"));
+    CHECK(pos2->check_name("pOs2"));
+    CHECK(pos2->check_name("pos2"));
 }
 
-TEST_F(TApp, CheckNameNoUnderscore) {
+TEST_CASE_METHOD(TApp, "CheckNameNoUnderscore", "[creation]") {
     auto long1 = app.add_flag("--longoption1")->ignore_underscore();
     auto long2 = app.add_flag("--long_option2")->ignore_underscore();
 
@@ -285,30 +288,30 @@ TEST_F(TApp, CheckNameNoUnderscore) {
     auto pos1 = app.add_option("pos_option_1", x)->ignore_underscore();
     auto pos2 = app.add_option("posoption2", y)->ignore_underscore();
 
-    EXPECT_TRUE(long1->check_name("--long_option1"));
-    EXPECT_TRUE(long1->check_name("--longoption_1"));
-    EXPECT_TRUE(long1->check_name("--longoption1"));
-    EXPECT_TRUE(long1->check_name("--long__opt_ion__1"));
-    EXPECT_TRUE(long1->check_name("--__l_o_n_g_o_p_t_i_o_n_1"));
+    CHECK(long1->check_name("--long_option1"));
+    CHECK(long1->check_name("--longoption_1"));
+    CHECK(long1->check_name("--longoption1"));
+    CHECK(long1->check_name("--long__opt_ion__1"));
+    CHECK(long1->check_name("--__l_o_n_g_o_p_t_i_o_n_1"));
 
-    EXPECT_TRUE(long2->check_name("--long_option2"));
-    EXPECT_TRUE(long2->check_name("--longoption2"));
-    EXPECT_TRUE(long2->check_name("--longoption_2"));
-    EXPECT_TRUE(long2->check_name("--long__opt_ion__2"));
-    EXPECT_TRUE(long2->check_name("--__l_o_n_go_p_t_i_o_n_2__"));
+    CHECK(long2->check_name("--long_option2"));
+    CHECK(long2->check_name("--longoption2"));
+    CHECK(long2->check_name("--longoption_2"));
+    CHECK(long2->check_name("--long__opt_ion__2"));
+    CHECK(long2->check_name("--__l_o_n_go_p_t_i_o_n_2__"));
 
-    EXPECT_TRUE(pos1->check_name("pos_option1"));
-    EXPECT_TRUE(pos1->check_name("pos_option_1"));
-    EXPECT_TRUE(pos1->check_name("pos_o_p_t_i_on_1"));
-    EXPECT_TRUE(pos1->check_name("posoption1"));
+    CHECK(pos1->check_name("pos_option1"));
+    CHECK(pos1->check_name("pos_option_1"));
+    CHECK(pos1->check_name("pos_o_p_t_i_on_1"));
+    CHECK(pos1->check_name("posoption1"));
 
-    EXPECT_TRUE(pos2->check_name("pos_option2"));
-    EXPECT_TRUE(pos2->check_name("pos_option_2"));
-    EXPECT_TRUE(pos2->check_name("pos_o_p_t_i_on_2"));
-    EXPECT_TRUE(pos2->check_name("posoption2"));
+    CHECK(pos2->check_name("pos_option2"));
+    CHECK(pos2->check_name("pos_option_2"));
+    CHECK(pos2->check_name("pos_o_p_t_i_on_2"));
+    CHECK(pos2->check_name("posoption2"));
 }
 
-TEST_F(TApp, CheckNameNoCaseNoUnderscore) {
+TEST_CASE_METHOD(TApp, "CheckNameNoCaseNoUnderscore", "[creation]") {
     auto long1 = app.add_flag("--LongoptioN1")->ignore_underscore()->ignore_case();
     auto long2 = app.add_flag("--long_Option2")->ignore_case()->ignore_underscore();
 
@@ -316,85 +319,85 @@ TEST_F(TApp, CheckNameNoCaseNoUnderscore) {
     auto pos1 = app.add_option("pos_Option_1", x)->ignore_underscore()->ignore_case();
     auto pos2 = app.add_option("posOption2", y)->ignore_case()->ignore_underscore();
 
-    EXPECT_TRUE(long1->check_name("--Long_Option1"));
-    EXPECT_TRUE(long1->check_name("--lONgoption_1"));
-    EXPECT_TRUE(long1->check_name("--LongOption1"));
-    EXPECT_TRUE(long1->check_name("--long__Opt_ion__1"));
-    EXPECT_TRUE(long1->check_name("--__l_o_N_g_o_P_t_i_O_n_1"));
+    CHECK(long1->check_name("--Long_Option1"));
+    CHECK(long1->check_name("--lONgoption_1"));
+    CHECK(long1->check_name("--LongOption1"));
+    CHECK(long1->check_name("--long__Opt_ion__1"));
+    CHECK(long1->check_name("--__l_o_N_g_o_P_t_i_O_n_1"));
 
-    EXPECT_TRUE(long2->check_name("--long_Option2"));
-    EXPECT_TRUE(long2->check_name("--LongOption2"));
-    EXPECT_TRUE(long2->check_name("--longOPTION_2"));
-    EXPECT_TRUE(long2->check_name("--long__OPT_ion__2"));
-    EXPECT_TRUE(long2->check_name("--__l_o_n_GO_p_t_i_o_n_2__"));
+    CHECK(long2->check_name("--long_Option2"));
+    CHECK(long2->check_name("--LongOption2"));
+    CHECK(long2->check_name("--longOPTION_2"));
+    CHECK(long2->check_name("--long__OPT_ion__2"));
+    CHECK(long2->check_name("--__l_o_n_GO_p_t_i_o_n_2__"));
 
-    EXPECT_TRUE(pos1->check_name("POS_Option1"));
-    EXPECT_TRUE(pos1->check_name("pos_option_1"));
-    EXPECT_TRUE(pos1->check_name("pos_o_p_t_i_on_1"));
-    EXPECT_TRUE(pos1->check_name("posoption1"));
+    CHECK(pos1->check_name("POS_Option1"));
+    CHECK(pos1->check_name("pos_option_1"));
+    CHECK(pos1->check_name("pos_o_p_t_i_on_1"));
+    CHECK(pos1->check_name("posoption1"));
 
-    EXPECT_TRUE(pos2->check_name("pos_option2"));
-    EXPECT_TRUE(pos2->check_name("pos_OPTION_2"));
-    EXPECT_TRUE(pos2->check_name("poS_o_p_T_I_on_2"));
-    EXPECT_TRUE(pos2->check_name("PosOption2"));
+    CHECK(pos2->check_name("pos_option2"));
+    CHECK(pos2->check_name("pos_OPTION_2"));
+    CHECK(pos2->check_name("poS_o_p_T_I_on_2"));
+    CHECK(pos2->check_name("PosOption2"));
 }
 
-TEST_F(TApp, PreSpaces) {
+TEST_CASE_METHOD(TApp, "PreSpaces", "[creation]") {
     int x{0};
     auto myapp = app.add_option(" -a, --long, other", x);
 
-    EXPECT_TRUE(myapp->check_lname("long"));
-    EXPECT_TRUE(myapp->check_sname("a"));
-    EXPECT_TRUE(myapp->check_name("other"));
+    CHECK(myapp->check_lname("long"));
+    CHECK(myapp->check_sname("a"));
+    CHECK(myapp->check_name("other"));
 }
 
-TEST_F(TApp, AllSpaces) {
+TEST_CASE_METHOD(TApp, "AllSpaces", "[creation]") {
     int x{0};
     auto myapp = app.add_option(" -a , --long , other ", x);
 
-    EXPECT_TRUE(myapp->check_lname("long"));
-    EXPECT_TRUE(myapp->check_sname("a"));
-    EXPECT_TRUE(myapp->check_name("other"));
+    CHECK(myapp->check_lname("long"));
+    CHECK(myapp->check_sname("a"));
+    CHECK(myapp->check_name("other"));
 }
 
-TEST_F(TApp, OptionFromDefaults) {
+TEST_CASE_METHOD(TApp, "OptionFromDefaults", "[creation]") {
     app.option_defaults()->required();
 
     // Options should remember defaults
     int x{0};
     auto opt = app.add_option("--simple", x);
-    EXPECT_TRUE(opt->get_required());
+    CHECK(opt->get_required());
 
     // Flags cannot be required
     auto flag = app.add_flag("--other");
-    EXPECT_FALSE(flag->get_required());
+    CHECK(!flag->get_required());
 
     app.option_defaults()->required(false);
     auto opt2 = app.add_option("--simple2", x);
-    EXPECT_FALSE(opt2->get_required());
+    CHECK(!opt2->get_required());
 
     app.option_defaults()->required()->ignore_case();
 
     auto opt3 = app.add_option("--simple3", x);
-    EXPECT_TRUE(opt3->get_required());
-    EXPECT_TRUE(opt3->get_ignore_case());
+    CHECK(opt3->get_required());
+    CHECK(opt3->get_ignore_case());
 
     app.option_defaults()->required()->ignore_underscore();
 
     auto opt4 = app.add_option("--simple4", x);
-    EXPECT_TRUE(opt4->get_required());
-    EXPECT_TRUE(opt4->get_ignore_underscore());
+    CHECK(opt4->get_required());
+    CHECK(opt4->get_ignore_underscore());
 }
 
-TEST_F(TApp, OptionFromDefaultsSubcommands) {
+TEST_CASE_METHOD(TApp, "OptionFromDefaultsSubcommands", "[creation]") {
     // Initial defaults
-    EXPECT_FALSE(app.option_defaults()->get_required());
-    EXPECT_EQ(app.option_defaults()->get_multi_option_policy(), CLI::MultiOptionPolicy::Throw);
-    EXPECT_FALSE(app.option_defaults()->get_ignore_case());
-    EXPECT_FALSE(app.option_defaults()->get_ignore_underscore());
-    EXPECT_FALSE(app.option_defaults()->get_disable_flag_override());
-    EXPECT_TRUE(app.option_defaults()->get_configurable());
-    EXPECT_EQ(app.option_defaults()->get_group(), "Options");
+    CHECK(!app.option_defaults()->get_required());
+    CHECK(CLI::MultiOptionPolicy::Throw == app.option_defaults()->get_multi_option_policy());
+    CHECK(!app.option_defaults()->get_ignore_case());
+    CHECK(!app.option_defaults()->get_ignore_underscore());
+    CHECK(!app.option_defaults()->get_disable_flag_override());
+    CHECK(app.option_defaults()->get_configurable());
+    CHECK("Options" == app.option_defaults()->get_group());
 
     app.option_defaults()
         ->required()
@@ -407,55 +410,55 @@ TEST_F(TApp, OptionFromDefaultsSubcommands) {
 
     auto app2 = app.add_subcommand("app2");
 
-    EXPECT_TRUE(app2->option_defaults()->get_required());
-    EXPECT_EQ(app2->option_defaults()->get_multi_option_policy(), CLI::MultiOptionPolicy::TakeLast);
-    EXPECT_TRUE(app2->option_defaults()->get_ignore_case());
-    EXPECT_TRUE(app2->option_defaults()->get_ignore_underscore());
-    EXPECT_FALSE(app2->option_defaults()->get_configurable());
-    EXPECT_TRUE(app.option_defaults()->get_disable_flag_override());
-    EXPECT_EQ(app2->option_defaults()->get_group(), "Something");
+    CHECK(app2->option_defaults()->get_required());
+    CHECK(CLI::MultiOptionPolicy::TakeLast == app2->option_defaults()->get_multi_option_policy());
+    CHECK(app2->option_defaults()->get_ignore_case());
+    CHECK(app2->option_defaults()->get_ignore_underscore());
+    CHECK(!app2->option_defaults()->get_configurable());
+    CHECK(app.option_defaults()->get_disable_flag_override());
+    CHECK("Something" == app2->option_defaults()->get_group());
 }
 
-TEST_F(TApp, GetNameCheck) {
+TEST_CASE_METHOD(TApp, "GetNameCheck", "[creation]") {
     int x{0};
     auto a = app.add_flag("--that");
     auto b = app.add_flag("-x");
     auto c = app.add_option("pos", x);
     auto d = app.add_option("one,-o,--other", x);
 
-    EXPECT_EQ(a->get_name(false, true), "--that");
-    EXPECT_EQ(b->get_name(false, true), "-x");
-    EXPECT_EQ(c->get_name(false, true), "pos");
+    CHECK("--that" == a->get_name(false, true));
+    CHECK("-x" == b->get_name(false, true));
+    CHECK("pos" == c->get_name(false, true));
 
-    EXPECT_EQ(d->get_name(), "--other");
-    EXPECT_EQ(d->get_name(false, false), "--other");
-    EXPECT_EQ(d->get_name(false, true), "-o,--other");
-    EXPECT_EQ(d->get_name(true, true), "one,-o,--other");
-    EXPECT_EQ(d->get_name(true, false), "one");
+    CHECK("--other" == d->get_name());
+    CHECK("--other" == d->get_name(false, false));
+    CHECK("-o,--other" == d->get_name(false, true));
+    CHECK("one,-o,--other" == d->get_name(true, true));
+    CHECK("one" == d->get_name(true, false));
 }
 
-TEST_F(TApp, SubcommandDefaults) {
+TEST_CASE_METHOD(TApp, "SubcommandDefaults", "[creation]") {
     // allow_extras, prefix_command, ignore_case, fallthrough, group, min/max subcommand, validate_positionals
 
     // Initial defaults
-    EXPECT_FALSE(app.get_allow_extras());
-    EXPECT_FALSE(app.get_prefix_command());
-    EXPECT_FALSE(app.get_immediate_callback());
-    EXPECT_FALSE(app.get_ignore_case());
-    EXPECT_FALSE(app.get_ignore_underscore());
+    CHECK(!app.get_allow_extras());
+    CHECK(!app.get_prefix_command());
+    CHECK(!app.get_immediate_callback());
+    CHECK(!app.get_ignore_case());
+    CHECK(!app.get_ignore_underscore());
 #ifdef _WIN32
-    EXPECT_TRUE(app.get_allow_windows_style_options());
+    CHECK(app.get_allow_windows_style_options());
 #else
-    EXPECT_FALSE(app.get_allow_windows_style_options());
+    CHECK(!app.get_allow_windows_style_options());
 #endif
-    EXPECT_FALSE(app.get_fallthrough());
-    EXPECT_FALSE(app.get_configurable());
-    EXPECT_FALSE(app.get_validate_positionals());
+    CHECK(!app.get_fallthrough());
+    CHECK(!app.get_configurable());
+    CHECK(!app.get_validate_positionals());
 
-    EXPECT_EQ(app.get_footer(), "");
-    EXPECT_EQ(app.get_group(), "Subcommands");
-    EXPECT_EQ(app.get_require_subcommand_min(), 0u);
-    EXPECT_EQ(app.get_require_subcommand_max(), 0u);
+    CHECK("" == app.get_footer());
+    CHECK("Subcommands" == app.get_group());
+    CHECK(0u == app.get_require_subcommand_min());
+    CHECK(0u == app.get_require_subcommand_max());
 
     app.allow_extras();
     app.prefix_command();
@@ -478,57 +481,57 @@ TEST_F(TApp, SubcommandDefaults) {
     auto app2 = app.add_subcommand("app2");
 
     // Initial defaults
-    EXPECT_TRUE(app2->get_allow_extras());
-    EXPECT_TRUE(app2->get_prefix_command());
-    EXPECT_TRUE(app2->get_immediate_callback());
-    EXPECT_TRUE(app2->get_ignore_case());
-    EXPECT_TRUE(app2->get_ignore_underscore());
+    CHECK(app2->get_allow_extras());
+    CHECK(app2->get_prefix_command());
+    CHECK(app2->get_immediate_callback());
+    CHECK(app2->get_ignore_case());
+    CHECK(app2->get_ignore_underscore());
 #ifdef _WIN32
-    EXPECT_FALSE(app2->get_allow_windows_style_options());
+    CHECK(!app2->get_allow_windows_style_options());
 #else
-    EXPECT_TRUE(app2->get_allow_windows_style_options());
+    CHECK(app2->get_allow_windows_style_options());
 #endif
-    EXPECT_TRUE(app2->get_fallthrough());
-    EXPECT_TRUE(app2->get_validate_positionals());
-    EXPECT_TRUE(app2->get_configurable());
-    EXPECT_EQ(app2->get_footer(), "footy");
-    EXPECT_EQ(app2->get_group(), "Stuff");
-    EXPECT_EQ(app2->get_require_subcommand_min(), 0u);
-    EXPECT_EQ(app2->get_require_subcommand_max(), 3u);
+    CHECK(app2->get_fallthrough());
+    CHECK(app2->get_validate_positionals());
+    CHECK(app2->get_configurable());
+    CHECK("footy" == app2->get_footer());
+    CHECK("Stuff" == app2->get_group());
+    CHECK(0u == app2->get_require_subcommand_min());
+    CHECK(3u == app2->get_require_subcommand_max());
 }
 
-TEST_F(TApp, SubcommandMinMax) {
+TEST_CASE_METHOD(TApp, "SubcommandMinMax", "[creation]") {
 
-    EXPECT_EQ(app.get_require_subcommand_min(), 0u);
-    EXPECT_EQ(app.get_require_subcommand_max(), 0u);
+    CHECK(0u == app.get_require_subcommand_min());
+    CHECK(0u == app.get_require_subcommand_max());
 
     app.require_subcommand();
 
-    EXPECT_EQ(app.get_require_subcommand_min(), 1u);
-    EXPECT_EQ(app.get_require_subcommand_max(), 0u);
+    CHECK(1u == app.get_require_subcommand_min());
+    CHECK(0u == app.get_require_subcommand_max());
 
     app.require_subcommand(2);
 
-    EXPECT_EQ(app.get_require_subcommand_min(), 2u);
-    EXPECT_EQ(app.get_require_subcommand_max(), 2u);
+    CHECK(2u == app.get_require_subcommand_min());
+    CHECK(2u == app.get_require_subcommand_max());
 
     app.require_subcommand(0);
 
-    EXPECT_EQ(app.get_require_subcommand_min(), 0u);
-    EXPECT_EQ(app.get_require_subcommand_max(), 0u);
+    CHECK(0u == app.get_require_subcommand_min());
+    CHECK(0u == app.get_require_subcommand_max());
 
     app.require_subcommand(-2);
 
-    EXPECT_EQ(app.get_require_subcommand_min(), 0u);
-    EXPECT_EQ(app.get_require_subcommand_max(), 2u);
+    CHECK(0u == app.get_require_subcommand_min());
+    CHECK(2u == app.get_require_subcommand_max());
 
     app.require_subcommand(3, 7);
 
-    EXPECT_EQ(app.get_require_subcommand_min(), 3u);
-    EXPECT_EQ(app.get_require_subcommand_max(), 7u);
+    CHECK(3u == app.get_require_subcommand_min());
+    CHECK(7u == app.get_require_subcommand_max());
 }
 
-TEST_F(TApp, GetOptionList) {
+TEST_CASE_METHOD(TApp, "GetOptionList", "[creation]") {
     int two{0};
     auto flag = app.add_flag("--one");
     auto opt = app.add_option("--two", two);
@@ -536,36 +539,36 @@ TEST_F(TApp, GetOptionList) {
     const CLI::App &const_app = app;  // const alias to force use of const-methods
     std::vector<const CLI::Option *> opt_list = const_app.get_options();
 
-    ASSERT_EQ(opt_list.size(), static_cast<std::size_t>(3));
-    EXPECT_EQ(opt_list.at(1), flag);
-    EXPECT_EQ(opt_list.at(2), opt);
+    REQUIRE(static_cast<std::size_t>(3) == opt_list.size());
+    CHECK(flag == opt_list.at(1));
+    CHECK(opt == opt_list.at(2));
 
     std::vector<CLI::Option *> nonconst_opt_list = app.get_options();
     for(std::size_t i = 0; i < opt_list.size(); ++i) {
-        EXPECT_EQ(nonconst_opt_list.at(i), opt_list.at(i));
+        CHECK(opt_list.at(i) == nonconst_opt_list.at(i));
     }
 }
 
-TEST(ValidatorTests, TestValidatorCreation) {
+TEST_CASE("ValidatorTests: TestValidatorCreation", "[creation]") {
     std::function<std::string(std::string &)> op1 = [](std::string &val) {
         return (val.size() >= 5) ? std::string{} : val;
     };
     CLI::Validator V(op1, "", "size");
 
-    EXPECT_EQ(V.get_name(), "size");
+    CHECK("size" == V.get_name());
     V.name("harry");
-    EXPECT_EQ(V.get_name(), "harry");
-    EXPECT_TRUE(V.get_active());
+    CHECK("harry" == V.get_name());
+    CHECK(V.get_active());
 
-    EXPECT_EQ(V("test"), "test");
-    EXPECT_EQ(V("test5"), std::string{});
+    CHECK("test" == V("test"));
+    CHECK(std::string{} == V("test5"));
 
-    EXPECT_EQ(V.get_description(), std::string{});
+    CHECK(std::string{} == V.get_description());
     V.description("this is a description");
-    EXPECT_EQ(V.get_description(), "this is a description");
+    CHECK("this is a description" == V.get_description());
 }
 
-TEST(ValidatorTests, TestValidatorOps) {
+TEST_CASE("ValidatorTests: TestValidatorOps", "[creation]") {
     std::function<std::string(std::string &)> op1 = [](std::string &val) {
         return (val.size() >= 5) ? std::string{} : val;
     };
@@ -590,72 +593,72 @@ TEST(ValidatorTests, TestValidatorOps) {
     std::string eight(8, 'a');
     std::string nine(9, 'a');
     std::string ten(10, 'a');
-    EXPECT_TRUE(V1(five).empty());
-    EXPECT_FALSE(V1(four).empty());
+    CHECK(V1(five).empty());
+    CHECK(!V1(four).empty());
 
-    EXPECT_TRUE(V2(nine).empty());
-    EXPECT_FALSE(V2(eight).empty());
+    CHECK(V2(nine).empty());
+    CHECK(!V2(eight).empty());
 
-    EXPECT_TRUE(V3(two).empty());
-    EXPECT_FALSE(V3(four).empty());
+    CHECK(V3(two).empty());
+    CHECK(!V3(four).empty());
 
-    EXPECT_TRUE(V4(eight).empty());
-    EXPECT_FALSE(V4(ten).empty());
+    CHECK(V4(eight).empty());
+    CHECK(!V4(ten).empty());
 
     auto V1a2 = V1 & V2;
-    EXPECT_EQ(V1a2.get_description(), "(SIZE >= 5) AND (SIZE >= 9)");
-    EXPECT_FALSE(V1a2(five).empty());
-    EXPECT_TRUE(V1a2(nine).empty());
+    CHECK("(SIZE >= 5) AND (SIZE >= 9)" == V1a2.get_description());
+    CHECK(!V1a2(five).empty());
+    CHECK(V1a2(nine).empty());
 
     auto V1a4 = V1 & V4;
-    EXPECT_EQ(V1a4.get_description(), "(SIZE >= 5) AND (SIZE <= 9)");
-    EXPECT_TRUE(V1a4(five).empty());
-    EXPECT_TRUE(V1a4(eight).empty());
-    EXPECT_FALSE(V1a4(ten).empty());
-    EXPECT_FALSE(V1a4(four).empty());
+    CHECK("(SIZE >= 5) AND (SIZE <= 9)" == V1a4.get_description());
+    CHECK(V1a4(five).empty());
+    CHECK(V1a4(eight).empty());
+    CHECK(!V1a4(ten).empty());
+    CHECK(!V1a4(four).empty());
 
     auto V1o3 = V1 | V3;
-    EXPECT_EQ(V1o3.get_description(), "(SIZE >= 5) OR (SIZE < 3)");
-    EXPECT_TRUE(V1o3(two).empty());
-    EXPECT_TRUE(V1o3(eight).empty());
-    EXPECT_TRUE(V1o3(ten).empty());
-    EXPECT_TRUE(V1o3(two).empty());
-    EXPECT_FALSE(V1o3(four).empty());
+    CHECK("(SIZE >= 5) OR (SIZE < 3)" == V1o3.get_description());
+    CHECK(V1o3(two).empty());
+    CHECK(V1o3(eight).empty());
+    CHECK(V1o3(ten).empty());
+    CHECK(V1o3(two).empty());
+    CHECK(!V1o3(four).empty());
 
     auto m1 = V1o3 & V4;
-    EXPECT_EQ(m1.get_description(), "((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)");
-    EXPECT_TRUE(m1(two).empty());
-    EXPECT_TRUE(m1(eight).empty());
-    EXPECT_FALSE(m1(ten).empty());
-    EXPECT_TRUE(m1(two).empty());
-    EXPECT_TRUE(m1(five).empty());
-    EXPECT_FALSE(m1(four).empty());
+    CHECK("((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)" == m1.get_description());
+    CHECK(m1(two).empty());
+    CHECK(m1(eight).empty());
+    CHECK(!m1(ten).empty());
+    CHECK(m1(two).empty());
+    CHECK(m1(five).empty());
+    CHECK(!m1(four).empty());
 
     auto m2 = m1 & V2;
-    EXPECT_EQ(m2.get_description(), "(((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)) AND (SIZE >= 9)");
-    EXPECT_FALSE(m2(two).empty());
-    EXPECT_FALSE(m2(eight).empty());
-    EXPECT_FALSE(m2(ten).empty());
-    EXPECT_FALSE(m2(two).empty());
-    EXPECT_TRUE(m2(nine).empty());
-    EXPECT_FALSE(m2(four).empty());
+    CHECK("(((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)) AND (SIZE >= 9)" == m2.get_description());
+    CHECK(!m2(two).empty());
+    CHECK(!m2(eight).empty());
+    CHECK(!m2(ten).empty());
+    CHECK(!m2(two).empty());
+    CHECK(m2(nine).empty());
+    CHECK(!m2(four).empty());
 
     auto m3 = m2 | V3;
-    EXPECT_EQ(m3.get_description(), "((((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)) AND (SIZE >= 9)) OR (SIZE < 3)");
-    EXPECT_TRUE(m3(two).empty());
-    EXPECT_FALSE(m3(eight).empty());
-    EXPECT_TRUE(m3(nine).empty());
-    EXPECT_FALSE(m3(four).empty());
+    CHECK("((((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)) AND (SIZE >= 9)) OR (SIZE < 3)" == m3.get_description());
+    CHECK(m3(two).empty());
+    CHECK(!m3(eight).empty());
+    CHECK(m3(nine).empty());
+    CHECK(!m3(four).empty());
 
     auto m4 = V3 | m2;
-    EXPECT_EQ(m4.get_description(), "(SIZE < 3) OR ((((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)) AND (SIZE >= 9))");
-    EXPECT_TRUE(m4(two).empty());
-    EXPECT_FALSE(m4(eight).empty());
-    EXPECT_TRUE(m4(nine).empty());
-    EXPECT_FALSE(m4(four).empty());
+    CHECK("(SIZE < 3) OR ((((SIZE >= 5) OR (SIZE < 3)) AND (SIZE <= 9)) AND (SIZE >= 9))" == m4.get_description());
+    CHECK(m4(two).empty());
+    CHECK(!m4(eight).empty());
+    CHECK(m4(nine).empty());
+    CHECK(!m4(four).empty());
 }
 
-TEST(ValidatorTests, TestValidatorNegation) {
+TEST_CASE("ValidatorTests: TestValidatorNegation", "[creation]") {
 
     std::function<std::string(std::string &)> op1 = [](std::string &val) {
         return (val.size() >= 5) ? std::string{} : val;
@@ -666,21 +669,21 @@ TEST(ValidatorTests, TestValidatorNegation) {
     std::string four(4, 'a');
     std::string five(5, 'a');
 
-    EXPECT_TRUE(V1(five).empty());
-    EXPECT_FALSE(V1(four).empty());
+    CHECK(V1(five).empty());
+    CHECK(!V1(four).empty());
 
     auto V2 = !V1;
-    EXPECT_FALSE(V2(five).empty());
-    EXPECT_TRUE(V2(four).empty());
-    EXPECT_EQ(V2.get_description(), "NOT SIZE >= 5");
+    CHECK(!V2(five).empty());
+    CHECK(V2(four).empty());
+    CHECK("NOT SIZE >= 5" == V2.get_description());
 
     V2.active(false);
-    EXPECT_TRUE(V2(five).empty());
-    EXPECT_TRUE(V2(four).empty());
-    EXPECT_TRUE(V2.get_description().empty());
+    CHECK(V2(five).empty());
+    CHECK(V2(four).empty());
+    CHECK(V2.get_description().empty());
 }
 
-TEST(ValidatorTests, ValidatorDefaults) {
+TEST_CASE("ValidatorTests: ValidatorDefaults", "[creation]") {
 
     CLI::Validator V1{};
 
@@ -688,23 +691,23 @@ TEST(ValidatorTests, ValidatorDefaults) {
     std::string five(5, 'a');
 
     // make sure this doesn't generate a seg fault or something
-    EXPECT_TRUE(V1(five).empty());
-    EXPECT_TRUE(V1(four).empty());
+    CHECK(V1(five).empty());
+    CHECK(V1(four).empty());
 
-    EXPECT_TRUE(V1.get_name().empty());
-    EXPECT_TRUE(V1.get_description().empty());
-    EXPECT_TRUE(V1.get_active());
-    EXPECT_TRUE(V1.get_modifying());
+    CHECK(V1.get_name().empty());
+    CHECK(V1.get_description().empty());
+    CHECK(V1.get_active());
+    CHECK(V1.get_modifying());
 
     CLI::Validator V2{"check"};
     // make sure this doesn't generate a seg fault or something
-    EXPECT_TRUE(V2(five).empty());
-    EXPECT_TRUE(V2(four).empty());
+    CHECK(V2(five).empty());
+    CHECK(V2(four).empty());
 
-    EXPECT_TRUE(V2.get_name().empty());
-    EXPECT_EQ(V2.get_description(), "check");
-    EXPECT_TRUE(V2.get_active());
-    EXPECT_TRUE(V2.get_modifying());
+    CHECK(V2.get_name().empty());
+    CHECK("check" == V2.get_description());
+    CHECK(V2.get_active());
+    CHECK(V2.get_modifying());
     // This class only support streaming in, not out
 }
 
@@ -731,7 +734,7 @@ std::istream &operator>>(std::istream &in, Unstreamable &value) {
 static_assert(CLI::detail::is_istreamable<Unstreamable>::value,
               "Unstreamable type is still unstreamable and it should be");
 
-TEST_F(TApp, MakeUnstreamableOptions) {
+TEST_CASE_METHOD(TApp, "MakeUnstreamableOptions", "[creation]") {
     Unstreamable value;
     app.add_option("--value", value);
 
@@ -746,10 +749,10 @@ TEST_F(TApp, MakeUnstreamableOptions) {
 
     args = {"--value", "45"};
     run();
-    EXPECT_EQ(value.get_x(), 45);
+    CHECK(45 == value.get_x());
 
     args = {"--values", "45", "27", "34"};
     run();
-    EXPECT_EQ(values.size(), 3u);
-    EXPECT_EQ(values[2].get_x(), 34);
+    CHECK(3u == values.size());
+    CHECK(34 == values[2].get_x());
 }
diff --git a/packages/CLI11/tests/DeprecatedTest.cpp b/packages/CLI11/tests/DeprecatedTest.cpp
index a8f41971270021c57f50ff64dcb5f97387175791..cf9987c6d97b6a8251630f60deff8e5466138e28 100644
--- a/packages/CLI11/tests/DeprecatedTest.cpp
+++ b/packages/CLI11/tests/DeprecatedTest.cpp
@@ -6,157 +6,16 @@
 
 #include "app_helper.hpp"
 
-#include "gmock/gmock.h"
+using Catch::Matchers::Contains;
 
-using ::testing::HasSubstr;
-using ::testing::Not;
-
-TEST(Deprecated, Empty) {
+TEST_CASE("Deprecated: Empty", "[deprecated]") {
     // No deprecated features at this time.
-    EXPECT_TRUE(true);
+    CHECK(true);
 }
 
 // Classic sets
 
-TEST_F(TApp, SetWithDefaults) {
-    int someint = 2;
-    app.add_set("-a", someint, {1, 2, 3, 4}, "", true);
-
-    args = {"-a1", "-a2"};
-
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
-}
-
-TEST_F(TApp, SetWithDefaultsConversion) {
-    int someint = 2;
-    app.add_set("-a", someint, {1, 2, 3, 4}, "", true);
-
-    args = {"-a", "hi"};
-
-    EXPECT_THROW(run(), CLI::ValidationError);
-}
-
-TEST_F(TApp, InSet) {
-
-    std::string choice;
-    app.add_set("-q,--quick", choice, {"one", "two", "three"});
-
-    args = {"--quick", "two"};
-
-    run();
-    EXPECT_EQ("two", choice);
-
-    args = {"--quick", "four"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-}
-
-TEST_F(TApp, InSetWithDefault) {
-
-    std::string choice = "one";
-    app.add_set("-q,--quick", choice, {"one", "two", "three"}, "", true);
-
-    run();
-    EXPECT_EQ("one", choice);
-
-    args = {"--quick", "two"};
-
-    run();
-    EXPECT_EQ("two", choice);
-
-    args = {"--quick", "four"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-}
-
-TEST_F(TApp, InIntSet) {
-
-    int choice;
-    app.add_set("-q,--quick", choice, {1, 2, 3});
-
-    args = {"--quick", "2"};
-
-    run();
-    EXPECT_EQ(2, choice);
-
-    args = {"--quick", "4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-}
-
-TEST_F(TApp, InIntSetWindows) {
-
-    int choice;
-    app.add_set("-q,--quick", choice, {1, 2, 3});
-    app.allow_windows_style_options();
-    args = {"/q", "2"};
-
-    run();
-    EXPECT_EQ(2, choice);
-
-    args = {"/q", "4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-
-    args = {"/q4"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
-}
-
-TEST_F(TApp, FailSet) {
-
-    int choice;
-    app.add_set("-q,--quick", choice, {1, 2, 3});
-
-    args = {"--quick", "3", "--quick=2"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
-
-    args = {"--quick=hello"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-}
-
-TEST_F(TApp, FailMutableSet) {
-
-    int choice;
-    std::set<int> vals{1, 2, 3};
-    app.add_mutable_set("-q,--quick", choice, vals);
-    app.add_mutable_set("-s,--slow", choice, vals, "", true);
-
-    args = {"--quick=hello"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-
-    args = {"--slow=hello"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-}
-
-// #113
-TEST_F(TApp, AddRemoveSetItems) {
-    std::set<std::string> items{"TYPE1", "TYPE2", "TYPE3", "TYPE4", "TYPE5"};
-
-    std::string type1, type2;
-    app.add_mutable_set("--type1", type1, items);
-    app.add_mutable_set("--type2", type2, items, "", true);
-
-    args = {"--type1", "TYPE1", "--type2", "TYPE2"};
-
-    run();
-    EXPECT_EQ(type1, "TYPE1");
-    EXPECT_EQ(type2, "TYPE2");
-
-    items.insert("TYPE6");
-    items.insert("TYPE7");
-
-    items.erase("TYPE1");
-    items.erase("TYPE2");
-
-    args = {"--type1", "TYPE6", "--type2", "TYPE7"};
-    run();
-    EXPECT_EQ(type1, "TYPE6");
-    EXPECT_EQ(type2, "TYPE7");
-
-    args = {"--type1", "TYPE1"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-
-    args = {"--type2", "TYPE2"};
-    EXPECT_THROW(run(), CLI::ValidationError);
-}
-
-TEST(THelp, Defaults) {
+TEST_CASE("THelp: Defaults", "[deprecated]") {
     CLI::App app{"My prog"};
 
     int one{1}, two{2};
@@ -165,24 +24,24 @@ TEST(THelp, Defaults) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("--one"));
-    EXPECT_THAT(help, HasSubstr("--set"));
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("=2"));
-    EXPECT_THAT(help, HasSubstr("2,3,4"));
+    CHECK_THAT(help, Contains("--one"));
+    CHECK_THAT(help, Contains("--set"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("=2"));
+    CHECK_THAT(help, Contains("2,3,4"));
 }
 
-TEST(THelp, VectorOpts) {
+TEST_CASE("THelp: VectorOpts", "[deprecated]") {
     CLI::App app{"My prog"};
     std::vector<int> x = {1, 2};
     app.add_option("-q,--quick", x, "", true);
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("INT=[1,2] ..."));
+    CHECK_THAT(help, Contains("INT=[1,2] ..."));
 }
 
-TEST(THelp, SetLower) {
+TEST_CASE("THelp: SetLower", "[deprecated]") {
     CLI::App app{"My prog"};
 
     std::string def{"One"};
@@ -190,14 +49,14 @@ TEST(THelp, SetLower) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("--set"));
-    EXPECT_THAT(help, HasSubstr("=One"));
-    EXPECT_THAT(help, HasSubstr("oNe"));
-    EXPECT_THAT(help, HasSubstr("twO"));
-    EXPECT_THAT(help, HasSubstr("THREE"));
+    CHECK_THAT(help, Contains("--set"));
+    CHECK_THAT(help, Contains("=One"));
+    CHECK_THAT(help, Contains("oNe"));
+    CHECK_THAT(help, Contains("twO"));
+    CHECK_THAT(help, Contains("THREE"));
 }
 
-TEST(THelp, ChangingSetDefaulted) {
+TEST_CASE("THelp: ChangingSetDefaulted", "[deprecated]") {
     CLI::App app;
 
     std::set<int> vals{1, 2, 3};
@@ -206,19 +65,19 @@ TEST(THelp, ChangingSetDefaulted) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, Not(HasSubstr("4")));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, !Contains("4"));
 
     vals.insert(4);
     vals.erase(1);
 
     help = app.help();
 
-    EXPECT_THAT(help, Not(HasSubstr("1")));
-    EXPECT_THAT(help, HasSubstr("4"));
+    CHECK_THAT(help, !Contains("1"));
+    CHECK_THAT(help, Contains("4"));
 }
 
-TEST(THelp, ChangingCaselessSetDefaulted) {
+TEST_CASE("THelp: ChangingCaselessSetDefaulted", "[deprecated]") {
     CLI::App app;
 
     std::set<std::string> vals{"1", "2", "3"};
@@ -227,19 +86,19 @@ TEST(THelp, ChangingCaselessSetDefaulted) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, Not(HasSubstr("4")));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, !Contains("4"));
 
     vals.insert("4");
     vals.erase("1");
 
     help = app.help();
 
-    EXPECT_THAT(help, Not(HasSubstr("1")));
-    EXPECT_THAT(help, HasSubstr("4"));
+    CHECK_THAT(help, !Contains("1"));
+    CHECK_THAT(help, Contains("4"));
 }
 
-TEST_F(TApp, DefaultOpts) {
+TEST_CASE_METHOD(TApp, "DefaultOpts", "[deprecated]") {
 
     int i = 3;
     std::string s = "HI";
@@ -251,116 +110,116 @@ TEST_F(TApp, DefaultOpts) {
 
     run();
 
-    EXPECT_EQ(1u, app.count("i"));
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(2, i);
-    EXPECT_EQ("9", s);
+    CHECK(app.count("i") == 1u);
+    CHECK(app.count("-s") == 1u);
+    CHECK(i == 2);
+    CHECK(s == "9");
 }
 
-TEST_F(TApp, VectorDefaultedFixedString) {
+TEST_CASE_METHOD(TApp, "VectorDefaultedFixedString", "[deprecated]") {
     std::vector<std::string> strvec{"one"};
     std::vector<std::string> answer{"mystring", "mystring2", "mystring3"};
 
     CLI::Option *opt = app.add_option("-s,--string", strvec, "", true)->expected(3);
-    EXPECT_EQ(3, opt->get_expected());
+    CHECK(opt->get_expected() == 3);
 
     args = {"--string", "mystring", "mystring2", "mystring3"};
     run();
-    EXPECT_EQ(3u, app.count("--string"));
-    EXPECT_EQ(answer, strvec);
+    CHECK(app.count("--string") == 3u);
+    CHECK(strvec == answer);
 }
 
-TEST_F(TApp, DefaultedResult) {
+TEST_CASE_METHOD(TApp, "DefaultedResult", "[deprecated]") {
     std::string sval = "NA";
     int ival;
     auto opts = app.add_option("--string", sval, "", true);
     auto optv = app.add_option("--val", ival);
     args = {};
     run();
-    EXPECT_EQ(sval, "NA");
+    CHECK("NA" == sval);
     std::string nString;
     opts->results(nString);
-    EXPECT_EQ(nString, "NA");
+    CHECK("NA" == nString);
     int newIval;
-    // EXPECT_THROW(optv->results(newIval), CLI::ConversionError);
+    // CHECK_THROWS_AS (optv->results(newIval), CLI::ConversionError);
     optv->default_str("442");
     optv->results(newIval);
-    EXPECT_EQ(newIval, 442);
+    CHECK(442 == newIval);
 }
 
-TEST_F(TApp, OptionWithDefaults) {
+TEST_CASE_METHOD(TApp, "OptionWithDefaults", "[deprecated]") {
     int someint = 2;
     app.add_option("-a", someint, "", true);
 
     args = {"-a1", "-a2"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
 // #209
-TEST_F(TApp, CustomUserSepParse) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse", "[deprecated]") {
 
     std::vector<int> vals = {1, 2, 3};
     args = {"--idx", "1,2,3"};
     auto opt = app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<int>({1, 2, 3}) == vals);
     std::vector<int> vals2;
     // check that the results vector gets the results in the same way
     opt->results(vals2);
-    EXPECT_EQ(vals2, vals);
+    CHECK(vals == vals2);
 
     app.remove_option(opt);
 
     app.add_option("--idx", vals, "", true)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<int>({1, 2, 3}) == vals);
 }
 
 // #209
-TEST_F(TApp, CustomUserSepParse2) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse2", "[deprecated]") {
 
     std::vector<int> vals = {1, 2, 3};
     args = {"--idx", "1,2,"};
     auto opt = app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 
     app.remove_option(opt);
 
     app.add_option("--idx", vals, "", true)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 }
 //
 // #209
-TEST_F(TApp, CustomUserSepParse4) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse4", "[deprecated]") {
 
     std::vector<int> vals;
     args = {"--idx", "1,    2"};
     auto opt = app.add_option("--idx", vals, "", true)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 
     app.remove_option(opt);
 
     app.add_option("--idx", vals)->delimiter(',');
     run();
-    EXPECT_EQ(vals, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == vals);
 }
 
 // #218
-TEST_F(TApp, CustomUserSepParse5) {
+TEST_CASE_METHOD(TApp, "CustomUserSepParse5", "[deprecated]") {
 
     std::vector<std::string> bar;
     args = {"this", "is", "a", "test"};
     auto opt = app.add_option("bar", bar, "bar");
     run();
-    EXPECT_EQ(bar, std::vector<std::string>({"this", "is", "a", "test"}));
+    CHECK(std::vector<std::string>({"this", "is", "a", "test"}) == bar);
 
     app.remove_option(opt);
     args = {"this", "is", "a", "test"};
     app.add_option("bar", bar, "bar", true);
     run();
-    EXPECT_EQ(bar, std::vector<std::string>({"this", "is", "a", "test"}));
+    CHECK(std::vector<std::string>({"this", "is", "a", "test"}) == bar);
 }
diff --git a/packages/CLI11/tests/FormatterTest.cpp b/packages/CLI11/tests/FormatterTest.cpp
index 513f956a03bc1d20e01b6fb6a05821903a28e6f0..22da56f0e543af444e9caadd0c60a2932fe35018 100644
--- a/packages/CLI11/tests/FormatterTest.cpp
+++ b/packages/CLI11/tests/FormatterTest.cpp
@@ -10,12 +10,10 @@
 #include "CLI/CLI.hpp"
 #endif
 
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
+#include "catch.hpp"
 #include <fstream>
 
-using ::testing::HasSubstr;
-using ::testing::Not;
+using Catch::Matchers::Contains;
 
 class SimpleFormatter : public CLI::FormatterBase {
   public:
@@ -26,17 +24,17 @@ class SimpleFormatter : public CLI::FormatterBase {
     }
 };
 
-TEST(Formatter, Nothing) {
+TEST_CASE("Formatter: Nothing", "[formatter]") {
     CLI::App app{"My prog"};
 
     app.formatter(std::make_shared<SimpleFormatter>());
 
     std::string help = app.help();
 
-    EXPECT_EQ(help, "This is really simple");
+    CHECK("This is really simple" == help);
 }
 
-TEST(Formatter, NothingLambda) {
+TEST_CASE("Formatter: NothingLambda", "[formatter]") {
     CLI::App app{"My prog"};
 
     app.formatter_fn(
@@ -44,10 +42,10 @@ TEST(Formatter, NothingLambda) {
 
     std::string help = app.help();
 
-    EXPECT_EQ(help, "This is really simple");
+    CHECK("This is really simple" == help);
 }
 
-TEST(Formatter, OptCustomize) {
+TEST_CASE("Formatter: OptCustomize", "[formatter]") {
     CLI::App app{"My prog"};
 
     auto optfmt = std::make_shared<CLI::Formatter>();
@@ -60,16 +58,15 @@ TEST(Formatter, OptCustomize) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("(MUST HAVE)"));
-    EXPECT_EQ(help,
-              "My prog\n"
-              "Usage: [OPTIONS]\n\n"
-              "Options:\n"
-              "  -h,--help              Print this help message and exit\n"
-              "  --opt INT (MUST HAVE)  Something\n\n");
+    CHECK_THAT(help, Contains("(MUST HAVE)"));
+    CHECK(help == "My prog\n"
+                  "Usage: [OPTIONS]\n\n"
+                  "Options:\n"
+                  "  -h,--help              Print this help message and exit\n"
+                  "  --opt INT (MUST HAVE)  Something\n\n");
 }
 
-TEST(Formatter, OptCustomizeSimple) {
+TEST_CASE("Formatter: OptCustomizeSimple", "[formatter]") {
     CLI::App app{"My prog"};
 
     app.get_formatter()->column_width(25);
@@ -80,16 +77,15 @@ TEST(Formatter, OptCustomizeSimple) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("(MUST HAVE)"));
-    EXPECT_EQ(help,
-              "My prog\n"
-              "Usage: [OPTIONS]\n\n"
-              "Options:\n"
-              "  -h,--help              Print this help message and exit\n"
-              "  --opt INT (MUST HAVE)  Something\n\n");
+    CHECK_THAT(help, Contains("(MUST HAVE)"));
+    CHECK(help == "My prog\n"
+                  "Usage: [OPTIONS]\n\n"
+                  "Options:\n"
+                  "  -h,--help              Print this help message and exit\n"
+                  "  --opt INT (MUST HAVE)  Something\n\n");
 }
 
-TEST(Formatter, OptCustomizeOptionText) {
+TEST_CASE("Formatter: OptCustomizeOptionText", "[formatter]") {
     CLI::App app{"My prog"};
 
     app.get_formatter()->column_width(25);
@@ -99,16 +95,15 @@ TEST(Formatter, OptCustomizeOptionText) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("(ARG)"));
-    EXPECT_EQ(help,
-              "My prog\n"
-              "Usage: [OPTIONS]\n\n"
-              "Options:\n"
-              "  -h,--help              Print this help message and exit\n"
-              "  --opt (ARG)            Something\n\n");
+    CHECK_THAT(help, Contains("(ARG)"));
+    CHECK(help == "My prog\n"
+                  "Usage: [OPTIONS]\n\n"
+                  "Options:\n"
+                  "  -h,--help              Print this help message and exit\n"
+                  "  --opt (ARG)            Something\n\n");
 }
 
-TEST(Formatter, FalseFlagExample) {
+TEST_CASE("Formatter: FalseFlagExample", "[formatter]") {
     CLI::App app{"My prog"};
 
     app.get_formatter()->column_width(25);
@@ -122,12 +117,12 @@ TEST(Formatter, FalseFlagExample) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("--no_opt{false}"));
-    EXPECT_THAT(help, HasSubstr("--no_opt2{false}"));
-    EXPECT_THAT(help, HasSubstr("-O{false}"));
+    CHECK_THAT(help, Contains("--no_opt{false}"));
+    CHECK_THAT(help, Contains("--no_opt2{false}"));
+    CHECK_THAT(help, Contains("-O{false}"));
 }
 
-TEST(Formatter, AppCustomize) {
+TEST_CASE("Formatter: AppCustomize", "[formatter]") {
     CLI::App app{"My prog"};
     app.add_subcommand("subcom1", "This");
 
@@ -139,17 +134,16 @@ TEST(Formatter, AppCustomize) {
     app.add_subcommand("subcom2", "This");
 
     std::string help = app.help();
-    EXPECT_EQ(help,
-              "My prog\n"
-              "Run: [OPTIONS] [SUBCOMMAND]\n\n"
-              "Options:\n"
-              "  -h,--help         Print this help message and exit\n\n"
-              "Subcommands:\n"
-              "  subcom1           This\n"
-              "  subcom2           This\n\n");
+    CHECK(help == "My prog\n"
+                  "Run: [OPTIONS] [SUBCOMMAND]\n\n"
+                  "Options:\n"
+                  "  -h,--help         Print this help message and exit\n\n"
+                  "Subcommands:\n"
+                  "  subcom1           This\n"
+                  "  subcom2           This\n\n");
 }
 
-TEST(Formatter, AppCustomizeSimple) {
+TEST_CASE("Formatter: AppCustomizeSimple", "[formatter]") {
     CLI::App app{"My prog"};
     app.add_subcommand("subcom1", "This");
 
@@ -159,48 +153,47 @@ TEST(Formatter, AppCustomizeSimple) {
     app.add_subcommand("subcom2", "This");
 
     std::string help = app.help();
-    EXPECT_EQ(help,
-              "My prog\n"
-              "Run: [OPTIONS] [SUBCOMMAND]\n\n"
-              "Options:\n"
-              "  -h,--help         Print this help message and exit\n\n"
-              "Subcommands:\n"
-              "  subcom1           This\n"
-              "  subcom2           This\n\n");
+    CHECK(help == "My prog\n"
+                  "Run: [OPTIONS] [SUBCOMMAND]\n\n"
+                  "Options:\n"
+                  "  -h,--help         Print this help message and exit\n\n"
+                  "Subcommands:\n"
+                  "  subcom1           This\n"
+                  "  subcom2           This\n\n");
 }
 
-TEST(Formatter, AllSub) {
+TEST_CASE("Formatter: AllSub", "[formatter]") {
     CLI::App app{"My prog"};
     CLI::App *sub = app.add_subcommand("subcom", "This");
     sub->add_flag("--insub", "MyFlag");
 
     std::string help = app.help("", CLI::AppFormatMode::All);
-    EXPECT_THAT(help, HasSubstr("--insub"));
-    EXPECT_THAT(help, HasSubstr("subcom"));
+    CHECK_THAT(help, Contains("--insub"));
+    CHECK_THAT(help, Contains("subcom"));
 }
 
-TEST(Formatter, AllSubRequired) {
+TEST_CASE("Formatter: AllSubRequired", "[formatter]") {
     CLI::App app{"My prog"};
     CLI::App *sub = app.add_subcommand("subcom", "This");
     sub->add_flag("--insub", "MyFlag");
     sub->required();
     std::string help = app.help("", CLI::AppFormatMode::All);
-    EXPECT_THAT(help, HasSubstr("--insub"));
-    EXPECT_THAT(help, HasSubstr("subcom"));
-    EXPECT_THAT(help, HasSubstr("REQUIRED"));
+    CHECK_THAT(help, Contains("--insub"));
+    CHECK_THAT(help, Contains("subcom"));
+    CHECK_THAT(help, Contains("REQUIRED"));
 }
 
-TEST(Formatter, NamelessSub) {
+TEST_CASE("Formatter: NamelessSub", "[formatter]") {
     CLI::App app{"My prog"};
     CLI::App *sub = app.add_subcommand("", "This subcommand");
     sub->add_flag("--insub", "MyFlag");
 
     std::string help = app.help("", CLI::AppFormatMode::Normal);
-    EXPECT_THAT(help, HasSubstr("--insub"));
-    EXPECT_THAT(help, HasSubstr("This subcommand"));
+    CHECK_THAT(help, Contains("--insub"));
+    CHECK_THAT(help, Contains("This subcommand"));
 }
 
-TEST(Formatter, NamelessSubInGroup) {
+TEST_CASE("Formatter: NamelessSubInGroup", "[formatter]") {
     CLI::App app{"My prog"};
     CLI::App *sub = app.add_subcommand("", "This subcommand");
     CLI::App *sub2 = app.add_subcommand("sub2", "subcommand2");
@@ -210,9 +203,9 @@ TEST(Formatter, NamelessSubInGroup) {
     sub->group("group1");
     sub2->group("group1");
     std::string help = app.help("", CLI::AppFormatMode::Normal);
-    EXPECT_THAT(help, HasSubstr("--insub"));
-    EXPECT_THAT(help, HasSubstr("This subcommand"));
-    EXPECT_THAT(help, HasSubstr("group1"));
-    EXPECT_THAT(help, HasSubstr("sub2"));
-    EXPECT_TRUE(help.find("pos") == std::string::npos);
+    CHECK_THAT(help, Contains("--insub"));
+    CHECK_THAT(help, Contains("This subcommand"));
+    CHECK_THAT(help, Contains("group1"));
+    CHECK_THAT(help, Contains("sub2"));
+    CHECK(help.find("pos") == std::string::npos);
 }
diff --git a/packages/CLI11/tests/HelpTest.cpp b/packages/CLI11/tests/HelpTest.cpp
index a6169dfebbdcaa907f2163b57610941e6338a810..48089f6fc76ea10a7446447ebf8114fdd9797553 100644
--- a/packages/CLI11/tests/HelpTest.cpp
+++ b/packages/CLI11/tests/HelpTest.cpp
@@ -10,65 +10,63 @@
 #include "CLI/CLI.hpp"
 #endif
 
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
+#include "catch.hpp"
 #include <fstream>
 
-using ::testing::HasSubstr;
-using ::testing::Not;
+using Catch::Matchers::Contains;
 
-TEST(THelp, Basic) {
+TEST_CASE("THelp: Basic", "[help]") {
     CLI::App app{"My prog"};
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
 }
 
-TEST(THelp, Footer) {
+TEST_CASE("THelp: Footer", "[help]") {
     CLI::App app{"My prog"};
     app.footer("Report bugs to bugs@example.com");
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
-    EXPECT_THAT(help, HasSubstr("Report bugs to bugs@example.com"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
+    CHECK_THAT(help, Contains("Report bugs to bugs@example.com"));
 }
 
-TEST(THelp, FooterCallback) {
+TEST_CASE("THelp: FooterCallback", "[help]") {
     CLI::App app{"My prog"};
     app.footer([]() { return "Report bugs to bugs@example.com"; });
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
-    EXPECT_THAT(help, HasSubstr("Report bugs to bugs@example.com"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
+    CHECK_THAT(help, Contains("Report bugs to bugs@example.com"));
 }
 
-TEST(THelp, FooterCallbackBoth) {
+TEST_CASE("THelp: FooterCallbackBoth", "[help]") {
     CLI::App app{"My prog"};
     app.footer([]() { return "Report bugs to bugs@example.com"; });
     app.footer(" foot!!!!");
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
-    EXPECT_THAT(help, HasSubstr("Report bugs to bugs@example.com"));
-    EXPECT_THAT(help, HasSubstr("foot!!!!"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
+    CHECK_THAT(help, Contains("Report bugs to bugs@example.com"));
+    CHECK_THAT(help, Contains("foot!!!!"));
 }
 
-TEST(THelp, OptionalPositional) {
+TEST_CASE("THelp: OptionalPositional", "[help]") {
     CLI::App app{"My prog", "program"};
 
     std::string x;
@@ -76,16 +74,16 @@ TEST(THelp, OptionalPositional) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Positionals:"));
-    EXPECT_THAT(help, HasSubstr("something TEXT"));
-    EXPECT_THAT(help, HasSubstr("My option here"));
-    EXPECT_THAT(help, HasSubstr("Usage: program [OPTIONS] [something]"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Positionals:"));
+    CHECK_THAT(help, Contains("something TEXT"));
+    CHECK_THAT(help, Contains("My option here"));
+    CHECK_THAT(help, Contains("Usage: program [OPTIONS] [something]"));
 }
 
-TEST(THelp, Hidden) {
+TEST_CASE("THelp: Hidden", "[help]") {
     CLI::App app{"My prog"};
 
     std::string x;
@@ -95,15 +93,15 @@ TEST(THelp, Hidden) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, Not(HasSubstr("[something]")));
-    EXPECT_THAT(help, Not(HasSubstr("something ")));
-    EXPECT_THAT(help, Not(HasSubstr("another")));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, !Contains("[something]"));
+    CHECK_THAT(help, !Contains("something "));
+    CHECK_THAT(help, !Contains("another"));
 }
 
-TEST(THelp, deprecatedOptions) {
+TEST_CASE("THelp: deprecatedOptions", "[help]") {
     CLI::App app{"My prog"};
 
     std::string x;
@@ -116,12 +114,12 @@ TEST(THelp, deprecatedOptions) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("DEPRECATED"));
-    EXPECT_THAT(help, HasSubstr("something"));
-    EXPECT_NO_THROW(app.parse("--something deprecated"));
+    CHECK_THAT(help, Contains("DEPRECATED"));
+    CHECK_THAT(help, Contains("something"));
+    CHECK_NOTHROW(app.parse("--something deprecated"));
 }
 
-TEST(THelp, deprecatedOptions2) {
+TEST_CASE("THelp: deprecatedOptions2", "[help]") {
     CLI::App app{"My prog"};
 
     std::string x;
@@ -134,12 +132,12 @@ TEST(THelp, deprecatedOptions2) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("DEPRECATED"));
-    EXPECT_THAT(help, HasSubstr("something"));
-    EXPECT_NO_THROW(app.parse("--something deprecated"));
+    CHECK_THAT(help, Contains("DEPRECATED"));
+    CHECK_THAT(help, Contains("something"));
+    CHECK_NOTHROW(app.parse("--something deprecated"));
 }
 
-TEST(THelp, deprecatedOptions3) {
+TEST_CASE("THelp: deprecatedOptions3", "[help]") {
     CLI::App app{"My prog"};
 
     std::string x;
@@ -152,12 +150,12 @@ TEST(THelp, deprecatedOptions3) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("DEPRECATED"));
-    EXPECT_THAT(help, HasSubstr("'--something_else' instead"));
-    EXPECT_NO_THROW(app.parse("--something deprecated"));
+    CHECK_THAT(help, Contains("DEPRECATED"));
+    CHECK_THAT(help, Contains("'--something_else' instead"));
+    CHECK_NOTHROW(app.parse("--something deprecated"));
 }
 
-TEST(THelp, retiredOptions) {
+TEST_CASE("THelp: retiredOptions", "[help]") {
     CLI::App app{"My prog"};
 
     std::string x;
@@ -170,13 +168,13 @@ TEST(THelp, retiredOptions) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("RETIRED"));
-    EXPECT_THAT(help, HasSubstr("something"));
+    CHECK_THAT(help, Contains("RETIRED"));
+    CHECK_THAT(help, Contains("something"));
 
-    EXPECT_NO_THROW(app.parse("--something old"));
+    CHECK_NOTHROW(app.parse("--something old"));
 }
 
-TEST(THelp, retiredOptions2) {
+TEST_CASE("THelp: retiredOptions2", "[help]") {
     CLI::App app{"My prog"};
 
     std::string x;
@@ -188,12 +186,12 @@ TEST(THelp, retiredOptions2) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("RETIRED"));
-    EXPECT_THAT(help, HasSubstr("something"));
-    EXPECT_NO_THROW(app.parse("--something old"));
+    CHECK_THAT(help, Contains("RETIRED"));
+    CHECK_THAT(help, Contains("something"));
+    CHECK_NOTHROW(app.parse("--something old"));
 }
 
-TEST(THelp, retiredOptions3) {
+TEST_CASE("THelp: retiredOptions3", "[help]") {
     CLI::App app{"My prog"};
 
     std::string x;
@@ -206,13 +204,13 @@ TEST(THelp, retiredOptions3) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("RETIRED"));
-    EXPECT_THAT(help, HasSubstr("something"));
+    CHECK_THAT(help, Contains("RETIRED"));
+    CHECK_THAT(help, Contains("something"));
 
-    EXPECT_NO_THROW(app.parse("--something old"));
+    CHECK_NOTHROW(app.parse("--something old"));
 }
 
-TEST(THelp, HiddenGroup) {
+TEST_CASE("THelp: HiddenGroup", "[help]") {
     CLI::App app{"My prog"};
     // empty option group name should be hidden
     auto hgroup = app.add_option_group("");
@@ -223,22 +221,22 @@ TEST(THelp, HiddenGroup) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, Not(HasSubstr("[something]")));
-    EXPECT_THAT(help, Not(HasSubstr("something ")));
-    EXPECT_THAT(help, Not(HasSubstr("another")));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, !Contains("[something]"));
+    CHECK_THAT(help, !Contains("something "));
+    CHECK_THAT(help, !Contains("another"));
 
     hgroup->group("ghidden");
 
     help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("something "));
-    EXPECT_THAT(help, HasSubstr("another"));
+    CHECK_THAT(help, Contains("something "));
+    CHECK_THAT(help, Contains("another"));
 }
 
-TEST(THelp, OptionalPositionalAndOptions) {
+TEST_CASE("THelp: OptionalPositionalAndOptions", "[help]") {
     CLI::App app{"My prog", "AnotherProgram"};
     app.add_flag("-q,--quick");
 
@@ -247,13 +245,13 @@ TEST(THelp, OptionalPositionalAndOptions) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Usage: AnotherProgram [OPTIONS] [something]"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage: AnotherProgram [OPTIONS] [something]"));
 }
 
-TEST(THelp, RequiredPositionalAndOptions) {
+TEST_CASE("THelp: RequiredPositionalAndOptions", "[help]") {
     CLI::App app{"My prog"};
     app.add_flag("-q,--quick");
 
@@ -262,14 +260,14 @@ TEST(THelp, RequiredPositionalAndOptions) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("-h,--help"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Positionals:"));
-    EXPECT_THAT(help, HasSubstr("Usage: [OPTIONS] something"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("-h,--help"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Positionals:"));
+    CHECK_THAT(help, Contains("Usage: [OPTIONS] something"));
 }
 
-TEST(THelp, MultiOpts) {
+TEST_CASE("THelp: MultiOpts", "[help]") {
     CLI::App app{"My prog"};
     std::vector<int> x, y;
     app.add_option("-q,--quick", x, "Disc")->expected(2);
@@ -277,24 +275,24 @@ TEST(THelp, MultiOpts) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, Not(HasSubstr("Positionals:")));
-    EXPECT_THAT(help, HasSubstr("Usage: [OPTIONS]"));
-    EXPECT_THAT(help, HasSubstr("INT x 2"));
-    EXPECT_THAT(help, HasSubstr("INT ..."));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, !Contains("Positionals:"));
+    CHECK_THAT(help, Contains("Usage: [OPTIONS]"));
+    CHECK_THAT(help, Contains("INT x 2"));
+    CHECK_THAT(help, Contains("INT ..."));
 }
 
-TEST(THelp, VectorOpts) {
+TEST_CASE("THelp: VectorOpts", "[help]") {
     CLI::App app{"My prog"};
     std::vector<int> x = {1, 2};
     app.add_option("-q,--quick", x)->capture_default_str();
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("INT=[1,2] ..."));
+    CHECK_THAT(help, Contains("INT=[1,2] ..."));
 }
 
-TEST(THelp, MultiPosOpts) {
+TEST_CASE("THelp: MultiPosOpts", "[help]") {
     CLI::App app{"My prog"};
     app.name("program");
     std::vector<int> x, y;
@@ -303,26 +301,26 @@ TEST(THelp, MultiPosOpts) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, HasSubstr("Positionals:"));
-    EXPECT_THAT(help, HasSubstr("Usage: program [OPTIONS]"));
-    EXPECT_THAT(help, HasSubstr("INT x 2"));
-    EXPECT_THAT(help, HasSubstr("INT ..."));
-    EXPECT_THAT(help, HasSubstr("[quick(2x)]"));
-    EXPECT_THAT(help, HasSubstr("[vals...]"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, Contains("Positionals:"));
+    CHECK_THAT(help, Contains("Usage: program [OPTIONS]"));
+    CHECK_THAT(help, Contains("INT x 2"));
+    CHECK_THAT(help, Contains("INT ..."));
+    CHECK_THAT(help, Contains("[quick(2x)]"));
+    CHECK_THAT(help, Contains("[vals...]"));
 }
 
-TEST(THelp, EnvName) {
+TEST_CASE("THelp: EnvName", "[help]") {
     CLI::App app{"My prog"};
     std::string input;
     app.add_option("--something", input)->envname("SOME_ENV");
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("SOME_ENV"));
+    CHECK_THAT(help, Contains("SOME_ENV"));
 }
 
-TEST(THelp, Needs) {
+TEST_CASE("THelp: Needs", "[help]") {
     CLI::App app{"My prog"};
 
     CLI::Option *op1 = app.add_flag("--op1");
@@ -330,10 +328,10 @@ TEST(THelp, Needs) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("Needs: --op1"));
+    CHECK_THAT(help, Contains("Needs: --op1"));
 }
 
-TEST(THelp, NeedsPositional) {
+TEST_CASE("THelp: NeedsPositional", "[help]") {
     CLI::App app{"My prog"};
 
     int x{0}, y{0};
@@ -343,11 +341,11 @@ TEST(THelp, NeedsPositional) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("Positionals:"));
-    EXPECT_THAT(help, HasSubstr("Needs: op1"));
+    CHECK_THAT(help, Contains("Positionals:"));
+    CHECK_THAT(help, Contains("Needs: op1"));
 }
 
-TEST(THelp, Excludes) {
+TEST_CASE("THelp: Excludes", "[help]") {
     CLI::App app{"My prog"};
 
     CLI::Option *op1 = app.add_flag("--op1");
@@ -355,10 +353,10 @@ TEST(THelp, Excludes) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("Excludes: --op1"));
+    CHECK_THAT(help, Contains("Excludes: --op1"));
 }
 
-TEST(THelp, ExcludesPositional) {
+TEST_CASE("THelp: ExcludesPositional", "[help]") {
     CLI::App app{"My prog"};
 
     int x{0}, y{0};
@@ -368,11 +366,11 @@ TEST(THelp, ExcludesPositional) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("Positionals:"));
-    EXPECT_THAT(help, HasSubstr("Excludes: op1"));
+    CHECK_THAT(help, Contains("Positionals:"));
+    CHECK_THAT(help, Contains("Excludes: op1"));
 }
 
-TEST(THelp, ExcludesSymmetric) {
+TEST_CASE("THelp: ExcludesSymmetric", "[help]") {
     CLI::App app{"My prog"};
 
     CLI::Option *op1 = app.add_flag("--op1");
@@ -380,10 +378,10 @@ TEST(THelp, ExcludesSymmetric) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("Excludes: --op2"));
+    CHECK_THAT(help, Contains("Excludes: --op2"));
 }
 
-TEST(THelp, ManualSetters) {
+TEST_CASE("THelp: ManualSetters", "[help]") {
 
     CLI::App app{"My prog"};
 
@@ -392,35 +390,35 @@ TEST(THelp, ManualSetters) {
     CLI::Option *op1 = app.add_option("--op", x);
     op1->default_str("12");
     op1->type_name("BIGGLES");
-    EXPECT_EQ(x, 1);
+    CHECK(1 == x);
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("=12"));
-    EXPECT_THAT(help, HasSubstr("BIGGLES"));
+    CHECK_THAT(help, Contains("=12"));
+    CHECK_THAT(help, Contains("BIGGLES"));
 
     op1->default_val("14");
-    EXPECT_EQ(x, 14);
+    CHECK(14 == x);
     help = app.help();
-    EXPECT_THAT(help, HasSubstr("=14"));
+    CHECK_THAT(help, Contains("=14"));
 
     op1->default_val(12);
-    EXPECT_EQ(x, 12);
+    CHECK(12 == x);
     help = app.help();
-    EXPECT_THAT(help, HasSubstr("=12"));
+    CHECK_THAT(help, Contains("=12"));
 
-    EXPECT_TRUE(op1->get_run_callback_for_default());
+    CHECK(op1->get_run_callback_for_default());
     op1->run_callback_for_default(false);
-    EXPECT_FALSE(op1->get_run_callback_for_default());
+    CHECK(!op1->get_run_callback_for_default());
 
     op1->default_val(18);
     // x should not be modified in this case
-    EXPECT_EQ(x, 12);
+    CHECK(12 == x);
     help = app.help();
-    EXPECT_THAT(help, HasSubstr("=18"));
+    CHECK_THAT(help, Contains("=18"));
 }
 
-TEST(THelp, ManualSetterOverFunction) {
+TEST_CASE("THelp: ManualSetterOverFunction", "[help]") {
 
     CLI::App app{"My prog"};
 
@@ -431,31 +429,31 @@ TEST(THelp, ManualSetterOverFunction) {
     op1->default_str("12");
     op1->type_name("BIGGLES");
     op2->type_name("QUIGGLES");
-    EXPECT_EQ(x, 1);
+    CHECK(1 == x);
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("=12"));
-    EXPECT_THAT(help, HasSubstr("BIGGLES"));
-    EXPECT_THAT(help, HasSubstr("QUIGGLES"));
-    EXPECT_THAT(help, HasSubstr("{1,2}"));
+    CHECK_THAT(help, Contains("=12"));
+    CHECK_THAT(help, Contains("BIGGLES"));
+    CHECK_THAT(help, Contains("QUIGGLES"));
+    CHECK_THAT(help, Contains("{1,2}"));
 }
 
-TEST(THelp, Subcom) {
+TEST_CASE("THelp: Subcom", "[help]") {
     CLI::App app{"My prog"};
 
     auto sub1 = app.add_subcommand("sub1");
     app.add_subcommand("sub2");
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("Usage: [OPTIONS] [SUBCOMMAND]"));
+    CHECK_THAT(help, Contains("Usage: [OPTIONS] [SUBCOMMAND]"));
 
     app.require_subcommand();
 
     help = app.help();
-    EXPECT_THAT(help, HasSubstr("Usage: [OPTIONS] SUBCOMMAND"));
+    CHECK_THAT(help, Contains("Usage: [OPTIONS] SUBCOMMAND"));
 
     help = sub1->help();
-    EXPECT_THAT(help, HasSubstr("Usage: sub1"));
+    CHECK_THAT(help, Contains("Usage: sub1"));
 
     char x[] = "./myprogram";
     char y[] = "sub2";
@@ -464,10 +462,10 @@ TEST(THelp, Subcom) {
     app.parse(static_cast<int>(args.size()), args.data());
 
     help = app.help();
-    EXPECT_THAT(help, HasSubstr("Usage: ./myprogram sub2"));
+    CHECK_THAT(help, Contains("Usage: ./myprogram sub2"));
 }
 
-TEST(THelp, Subcom_alias) {
+TEST_CASE("THelp: Subcom_alias", "[help]") {
     CLI::App app{"My prog"};
 
     auto sub1 = app.add_subcommand("sub1", "Subcommand1 description test");
@@ -477,12 +475,12 @@ TEST(THelp, Subcom_alias) {
     app.add_subcommand("sub2", "Subcommand2 description test");
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("Usage: [OPTIONS] [SUBCOMMAND]"));
-    EXPECT_THAT(help, HasSubstr("sub_alias1"));
-    EXPECT_THAT(help, HasSubstr("sub_alias2"));
+    CHECK_THAT(help, Contains("Usage: [OPTIONS] [SUBCOMMAND]"));
+    CHECK_THAT(help, Contains("sub_alias1"));
+    CHECK_THAT(help, Contains("sub_alias2"));
 }
 
-TEST(THelp, Subcom_alias_group) {
+TEST_CASE("THelp: Subcom_alias_group", "[help]") {
     CLI::App app{"My prog"};
 
     auto sub1 = app.add_subcommand("", "Subcommand1 description test");
@@ -492,12 +490,12 @@ TEST(THelp, Subcom_alias_group) {
     app.add_subcommand("sub2", "Subcommand2 description test");
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("Usage: [OPTIONS] [SUBCOMMAND]"));
-    EXPECT_THAT(help, HasSubstr("sub_alias1"));
-    EXPECT_THAT(help, HasSubstr("sub_alias2"));
+    CHECK_THAT(help, Contains("Usage: [OPTIONS] [SUBCOMMAND]"));
+    CHECK_THAT(help, Contains("sub_alias1"));
+    CHECK_THAT(help, Contains("sub_alias2"));
 }
 
-TEST(THelp, MasterName) {
+TEST_CASE("THelp: MasterName", "[help]") {
     CLI::App app{"My prog", "MyRealName"};
 
     char x[] = "./myprogram";
@@ -505,10 +503,10 @@ TEST(THelp, MasterName) {
     std::vector<char *> args = {x};
     app.parse(static_cast<int>(args.size()), args.data());
 
-    EXPECT_THAT(app.help(), HasSubstr("Usage: MyRealName"));
+    CHECK_THAT(app.help(), Contains("Usage: MyRealName"));
 }
 
-TEST(THelp, IntDefaults) {
+TEST_CASE("THelp: IntDefaults", "[help]") {
     CLI::App app{"My prog"};
 
     int one{1}, two{2};
@@ -517,14 +515,14 @@ TEST(THelp, IntDefaults) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("--one"));
-    EXPECT_THAT(help, HasSubstr("--set"));
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("=2"));
-    EXPECT_THAT(help, HasSubstr("2,3,4"));
+    CHECK_THAT(help, Contains("--one"));
+    CHECK_THAT(help, Contains("--set"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("=2"));
+    CHECK_THAT(help, Contains("2,3,4"));
 }
 
-TEST(THelp, SetLower) {
+TEST_CASE("THelp: SetLower", "[help]") {
     CLI::App app{"My prog"};
     app.option_defaults()->always_capture_default();
 
@@ -533,14 +531,14 @@ TEST(THelp, SetLower) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("--set"));
-    EXPECT_THAT(help, HasSubstr("=One"));
-    EXPECT_THAT(help, HasSubstr("oNe"));
-    EXPECT_THAT(help, HasSubstr("twO"));
-    EXPECT_THAT(help, HasSubstr("THREE"));
+    CHECK_THAT(help, Contains("--set"));
+    CHECK_THAT(help, Contains("=One"));
+    CHECK_THAT(help, Contains("oNe"));
+    CHECK_THAT(help, Contains("twO"));
+    CHECK_THAT(help, Contains("THREE"));
 }
 
-TEST(THelp, OnlyOneHelp) {
+TEST_CASE("THelp: OnlyOneHelp", "[help]") {
     CLI::App app{"My prog"};
 
     // It is not supported to have more than one help flag, last one wins
@@ -548,10 +546,10 @@ TEST(THelp, OnlyOneHelp) {
     app.set_help_flag("--yelp", "Alias for help");
 
     std::vector<std::string> input{"--help"};
-    EXPECT_THROW(app.parse(input), CLI::ExtrasError);
+    CHECK_THROWS_AS(app.parse(input), CLI::ExtrasError);
 }
 
-TEST(THelp, MultiHelp) {
+TEST_CASE("THelp: MultiHelp", "[help]") {
     CLI::App app{"My prog"};
 
     // It is not supported to have more than one help flag, last one wins
@@ -559,10 +557,10 @@ TEST(THelp, MultiHelp) {
     app.allow_windows_style_options();
 
     std::vector<std::string> input{"/?"};
-    EXPECT_THROW(app.parse(input), CLI::CallForHelp);
+    CHECK_THROWS_AS(app.parse(input), CLI::CallForHelp);
 }
 
-TEST(THelp, OnlyOneAllHelp) {
+TEST_CASE("THelp: OnlyOneAllHelp", "[help]") {
     CLI::App app{"My prog"};
 
     // It is not supported to have more than one help flag, last one wins
@@ -570,37 +568,37 @@ TEST(THelp, OnlyOneAllHelp) {
     app.set_help_all_flag("--yelp", "Alias for help");
 
     std::vector<std::string> input{"--help-all"};
-    EXPECT_THROW(app.parse(input), CLI::ExtrasError);
+    CHECK_THROWS_AS(app.parse(input), CLI::ExtrasError);
 
     std::vector<std::string> input2{"--yelp"};
-    EXPECT_THROW(app.parse(input2), CLI::CallForAllHelp);
+    CHECK_THROWS_AS(app.parse(input2), CLI::CallForAllHelp);
 
     // Remove the flag
     app.set_help_all_flag();
     std::vector<std::string> input3{"--yelp"};
-    EXPECT_THROW(app.parse(input3), CLI::ExtrasError);
+    CHECK_THROWS_AS(app.parse(input3), CLI::ExtrasError);
 }
 
-TEST(THelp, RemoveHelp) {
+TEST_CASE("THelp: RemoveHelp", "[help]") {
     CLI::App app{"My prog"};
     app.set_help_flag();
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, Not(HasSubstr("-h,--help")));
-    EXPECT_THAT(help, Not(HasSubstr("Options:")));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, !Contains("-h,--help"));
+    CHECK_THAT(help, !Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
 
     std::vector<std::string> input{"--help"};
     try {
         app.parse(input);
     } catch(const CLI::ParseError &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::ExtrasError), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::ExtrasError));
     }
 }
 
-TEST(THelp, RemoveOtherMethodHelp) {
+TEST_CASE("THelp: RemoveOtherMethodHelp", "[help]") {
     CLI::App app{"My prog"};
 
     // Don't do this. Just in case, let's make sure it works.
@@ -608,20 +606,20 @@ TEST(THelp, RemoveOtherMethodHelp) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, Not(HasSubstr("-h,--help")));
-    EXPECT_THAT(help, Not(HasSubstr("Options:")));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, !Contains("-h,--help"));
+    CHECK_THAT(help, !Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
 
     std::vector<std::string> input{"--help"};
     try {
         app.parse(input);
     } catch(const CLI::ParseError &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::ExtrasError), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::ExtrasError));
     }
 }
 
-TEST(THelp, RemoveOtherMethodHelpAll) {
+TEST_CASE("THelp: RemoveOtherMethodHelpAll", "[help]") {
     CLI::App app{"My prog"};
 
     app.set_help_all_flag("--help-all");
@@ -630,61 +628,61 @@ TEST(THelp, RemoveOtherMethodHelpAll) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, Not(HasSubstr("--help-all")));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, !Contains("--help-all"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
 
     std::vector<std::string> input{"--help-all"};
     try {
         app.parse(input);
     } catch(const CLI::ParseError &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::ExtrasError), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::ExtrasError));
     }
 }
 
-TEST(THelp, NoHelp) {
+TEST_CASE("THelp: NoHelp", "[help]") {
     CLI::App app{"My prog"};
     app.set_help_flag();
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, Not(HasSubstr("-h,--help")));
-    EXPECT_THAT(help, Not(HasSubstr("Options:")));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, !Contains("-h,--help"));
+    CHECK_THAT(help, !Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
 
     std::vector<std::string> input{"--help"};
     try {
         app.parse(input);
     } catch(const CLI::ParseError &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::ExtrasError), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::ExtrasError));
     }
 }
 
-TEST(THelp, CustomHelp) {
+TEST_CASE("THelp: CustomHelp", "[help]") {
     CLI::App app{"My prog"};
 
     CLI::Option *help_option = app.set_help_flag("--yelp", "display help and exit");
-    EXPECT_EQ(app.get_help_ptr(), help_option);
+    CHECK(help_option == app.get_help_ptr());
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("My prog"));
-    EXPECT_THAT(help, Not(HasSubstr("-h,--help")));
-    EXPECT_THAT(help, HasSubstr("--yelp"));
-    EXPECT_THAT(help, HasSubstr("Options:"));
-    EXPECT_THAT(help, HasSubstr("Usage:"));
+    CHECK_THAT(help, Contains("My prog"));
+    CHECK_THAT(help, !Contains("-h,--help"));
+    CHECK_THAT(help, Contains("--yelp"));
+    CHECK_THAT(help, Contains("Options:"));
+    CHECK_THAT(help, Contains("Usage:"));
 
     std::vector<std::string> input{"--yelp"};
     try {
         app.parse(input);
     } catch(const CLI::CallForHelp &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::Success), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::Success));
     }
 }
 
-TEST(THelp, NextLineShouldBeAlignmentInMultilineDescription) {
+TEST_CASE("THelp: NextLineShouldBeAlignmentInMultilineDescription", "[help]") {
     CLI::App app;
     int i{0};
     const std::string first{"first line"};
@@ -693,10 +691,10 @@ TEST(THelp, NextLineShouldBeAlignmentInMultilineDescription) {
 
     const std::string help = app.help();
     const auto width = app.get_formatter()->get_column_width();
-    EXPECT_THAT(help, HasSubstr(first + "\n" + std::string(width, ' ') + second));
+    CHECK_THAT(help, Contains(first + "\n" + std::string(width, ' ') + second));
 }
 
-TEST(THelp, NiceName) {
+TEST_CASE("THelp: NiceName", "[help]") {
     CLI::App app;
 
     int x{0};
@@ -704,23 +702,23 @@ TEST(THelp, NiceName) {
     auto short_name = app.add_option("more,-x,-y", x);
     auto positional = app.add_option("posit", x);
 
-    EXPECT_EQ(long_name->get_name(), "--long");
-    EXPECT_EQ(short_name->get_name(), "-x");
-    EXPECT_EQ(positional->get_name(), "posit");
+    CHECK("--long" == long_name->get_name());
+    CHECK("-x" == short_name->get_name());
+    CHECK("posit" == positional->get_name());
 }
 
-TEST(Exit, ErrorWithHelp) {
+TEST_CASE("Exit: ErrorWithHelp", "[help]") {
     CLI::App app{"My prog"};
 
     std::vector<std::string> input{"-h"};
     try {
         app.parse(input);
     } catch(const CLI::CallForHelp &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::Success), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::Success));
     }
 }
 
-TEST(Exit, ErrorWithAllHelp) {
+TEST_CASE("Exit: ErrorWithAllHelp", "[help]") {
     CLI::App app{"My prog"};
     app.set_help_all_flag("--help-all", "All help");
 
@@ -728,33 +726,33 @@ TEST(Exit, ErrorWithAllHelp) {
     try {
         app.parse(input);
     } catch(const CLI::CallForAllHelp &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::Success), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::Success));
     }
 }
 
-TEST(Exit, ErrorWithoutHelp) {
+TEST_CASE("Exit: ErrorWithoutHelp", "[help]") {
     CLI::App app{"My prog"};
 
     std::vector<std::string> input{"--none"};
     try {
         app.parse(input);
     } catch(const CLI::ParseError &e) {
-        EXPECT_EQ(static_cast<int>(CLI::ExitCodes::ExtrasError), e.get_exit_code());
+        CHECK(e.get_exit_code() == static_cast<int>(CLI::ExitCodes::ExtrasError));
     }
 }
 
-TEST(Exit, ExitCodes) {
+TEST_CASE("Exit: ExitCodes", "[help]") {
     CLI::App app;
 
     auto i = static_cast<int>(CLI::ExitCodes::ExtrasError);
-    EXPECT_EQ(0, app.exit(CLI::Success()));
-    EXPECT_EQ(0, app.exit(CLI::CallForHelp()));
-    EXPECT_EQ(i, app.exit(CLI::ExtrasError({"Thing"})));
-    EXPECT_EQ(42, app.exit(CLI::RuntimeError(42)));
-    EXPECT_EQ(1, app.exit(CLI::RuntimeError()));  // Not sure if a default here is a good thing
+    CHECK(app.exit(CLI::Success()) == 0);
+    CHECK(app.exit(CLI::CallForHelp()) == 0);
+    CHECK(app.exit(CLI::ExtrasError({"Thing"})) == i);
+    CHECK(app.exit(CLI::RuntimeError(42)) == 42);
+    CHECK(app.exit(CLI::RuntimeError()) == 1);
 }
 
-struct CapturedHelp : public ::testing::Test {
+struct CapturedHelp {
     CLI::App app{"My Test Program"};
     std::stringstream out{};
     std::stringstream err{};
@@ -767,113 +765,112 @@ struct CapturedHelp : public ::testing::Test {
     }
 };
 
-TEST_F(CapturedHelp, Successful) {
-    EXPECT_EQ(run(CLI::Success()), 0);
-    EXPECT_EQ(out.str(), "");
-    EXPECT_EQ(err.str(), "");
+TEST_CASE_METHOD(CapturedHelp, "Successful", "[help]") {
+    CHECK(0 == run(CLI::Success()));
+    CHECK("" == out.str());
+    CHECK("" == err.str());
 }
 
-TEST_F(CapturedHelp, JustAnError) {
-    EXPECT_EQ(run(CLI::RuntimeError(42)), 42);
-    EXPECT_EQ(out.str(), "");
-    EXPECT_EQ(err.str(), "");
+TEST_CASE_METHOD(CapturedHelp, "JustAnError", "[help]") {
+    CHECK(42 == run(CLI::RuntimeError(42)));
+    CHECK("" == out.str());
+    CHECK("" == err.str());
 }
 
-TEST_F(CapturedHelp, CallForHelp) {
-    EXPECT_EQ(run(CLI::CallForHelp()), 0);
-    EXPECT_EQ(out.str(), app.help());
-    EXPECT_EQ(err.str(), "");
+TEST_CASE_METHOD(CapturedHelp, "CallForHelp", "[help]") {
+    CHECK(0 == run(CLI::CallForHelp()));
+    CHECK(app.help() == out.str());
+    CHECK("" == err.str());
 }
-TEST_F(CapturedHelp, CallForAllHelp) {
-    EXPECT_EQ(run(CLI::CallForAllHelp()), 0);
-    EXPECT_EQ(out.str(), app.help("", CLI::AppFormatMode::All));
-    EXPECT_EQ(err.str(), "");
+TEST_CASE_METHOD(CapturedHelp, "CallForAllHelp", "[help]") {
+    CHECK(0 == run(CLI::CallForAllHelp()));
+    CHECK(app.help("", CLI::AppFormatMode::All) == out.str());
+    CHECK("" == err.str());
 }
-TEST_F(CapturedHelp, CallForAllHelpOutput) {
+TEST_CASE_METHOD(CapturedHelp, "CallForAllHelpOutput", "[help]") {
     app.set_help_all_flag("--help-all", "Help all");
     app.add_subcommand("one", "One description");
     CLI::App *sub = app.add_subcommand("two");
     sub->add_flag("--three");
 
-    EXPECT_EQ(run(CLI::CallForAllHelp()), 0);
-    EXPECT_EQ(out.str(), app.help("", CLI::AppFormatMode::All));
-    EXPECT_EQ(err.str(), "");
-    EXPECT_THAT(out.str(), HasSubstr("one"));
-    EXPECT_THAT(out.str(), HasSubstr("two"));
-    EXPECT_THAT(out.str(), HasSubstr("--three"));
-
-    EXPECT_EQ(out.str(),
-              "My Test Program\n"
-              "Usage: [OPTIONS] [SUBCOMMAND]\n"
-              "\n"
-              "Options:\n"
-              "  -h,--help                   Print this help message and exit\n"
-              "  --help-all                  Help all\n"
-              "\n"
-              "Subcommands:\n"
-              "one\n"
-              "  One description\n\n"
-              "two\n"
-              "  Options:\n"
-              "    --three                     \n\n\n");
-}
-TEST_F(CapturedHelp, NewFormattedHelp) {
+    CHECK(0 == run(CLI::CallForAllHelp()));
+    CHECK(app.help("", CLI::AppFormatMode::All) == out.str());
+    CHECK("" == err.str());
+    CHECK_THAT(out.str(), Contains("one"));
+    CHECK_THAT(out.str(), Contains("two"));
+    CHECK_THAT(out.str(), Contains("--three"));
+
+    CHECK(out.str() == "My Test Program\n"
+                       "Usage: [OPTIONS] [SUBCOMMAND]\n"
+                       "\n"
+                       "Options:\n"
+                       "  -h,--help                   Print this help message and exit\n"
+                       "  --help-all                  Help all\n"
+                       "\n"
+                       "Subcommands:\n"
+                       "one\n"
+                       "  One description\n\n"
+                       "two\n"
+                       "  Options:\n"
+                       "    --three                     \n\n\n");
+}
+TEST_CASE_METHOD(CapturedHelp, "NewFormattedHelp", "[help]") {
     app.formatter_fn([](const CLI::App *, std::string, CLI::AppFormatMode) { return "New Help"; });
-    EXPECT_EQ(run(CLI::CallForHelp()), 0);
-    EXPECT_EQ(out.str(), "New Help");
-    EXPECT_EQ(err.str(), "");
+    CHECK(0 == run(CLI::CallForHelp()));
+    CHECK("New Help" == out.str());
+    CHECK("" == err.str());
 }
 
-TEST_F(CapturedHelp, NormalError) {
-    EXPECT_EQ(run(CLI::ExtrasError({"Thing"})), static_cast<int>(CLI::ExitCodes::ExtrasError));
-    EXPECT_EQ(out.str(), "");
-    EXPECT_THAT(err.str(), HasSubstr("for more information"));
-    EXPECT_THAT(err.str(), Not(HasSubstr("ExtrasError")));
-    EXPECT_THAT(err.str(), HasSubstr("Thing"));
-    EXPECT_THAT(err.str(), Not(HasSubstr(" or ")));
-    EXPECT_THAT(err.str(), Not(HasSubstr("Usage")));
+TEST_CASE_METHOD(CapturedHelp, "NormalError", "[help]") {
+    CHECK(static_cast<int>(CLI::ExitCodes::ExtrasError) == run(CLI::ExtrasError({"Thing"})));
+    CHECK("" == out.str());
+    CHECK_THAT(err.str(), Contains("for more information"));
+    CHECK_THAT(err.str(), !Contains("ExtrasError"));
+    CHECK_THAT(err.str(), Contains("Thing"));
+    CHECK_THAT(err.str(), !Contains(" or "));
+    CHECK_THAT(err.str(), !Contains("Usage"));
 }
 
-TEST_F(CapturedHelp, DoubleError) {
+TEST_CASE_METHOD(CapturedHelp, "DoubleError", "[help]") {
     app.set_help_all_flag("--help-all");
-    EXPECT_EQ(run(CLI::ExtrasError({"Thing"})), static_cast<int>(CLI::ExitCodes::ExtrasError));
-    EXPECT_EQ(out.str(), "");
-    EXPECT_THAT(err.str(), HasSubstr("for more information"));
-    EXPECT_THAT(err.str(), HasSubstr(" --help "));
-    EXPECT_THAT(err.str(), HasSubstr(" --help-all "));
-    EXPECT_THAT(err.str(), HasSubstr(" or "));
-    EXPECT_THAT(err.str(), Not(HasSubstr("ExtrasError")));
-    EXPECT_THAT(err.str(), HasSubstr("Thing"));
-    EXPECT_THAT(err.str(), Not(HasSubstr("Usage")));
-}
-
-TEST_F(CapturedHelp, AllOnlyError) {
+    CHECK(static_cast<int>(CLI::ExitCodes::ExtrasError) == run(CLI::ExtrasError({"Thing"})));
+    CHECK("" == out.str());
+    CHECK_THAT(err.str(), Contains("for more information"));
+    CHECK_THAT(err.str(), Contains(" --help "));
+    CHECK_THAT(err.str(), Contains(" --help-all "));
+    CHECK_THAT(err.str(), Contains(" or "));
+    CHECK_THAT(err.str(), !Contains("ExtrasError"));
+    CHECK_THAT(err.str(), Contains("Thing"));
+    CHECK_THAT(err.str(), !Contains("Usage"));
+}
+
+TEST_CASE_METHOD(CapturedHelp, "AllOnlyError", "[help]") {
     app.set_help_all_flag("--help-all");
     app.set_help_flag();
-    EXPECT_EQ(run(CLI::ExtrasError({"Thing"})), static_cast<int>(CLI::ExitCodes::ExtrasError));
-    EXPECT_EQ(out.str(), "");
-    EXPECT_THAT(err.str(), HasSubstr("for more information"));
-    EXPECT_THAT(err.str(), Not(HasSubstr(" --help ")));
-    EXPECT_THAT(err.str(), HasSubstr(" --help-all "));
-    EXPECT_THAT(err.str(), Not(HasSubstr(" or ")));
-    EXPECT_THAT(err.str(), Not(HasSubstr("ExtrasError")));
-    EXPECT_THAT(err.str(), HasSubstr("Thing"));
-    EXPECT_THAT(err.str(), Not(HasSubstr("Usage")));
-}
-
-TEST_F(CapturedHelp, ReplacedError) {
+    CHECK(static_cast<int>(CLI::ExitCodes::ExtrasError) == run(CLI::ExtrasError({"Thing"})));
+    CHECK("" == out.str());
+    CHECK_THAT(err.str(), Contains("for more information"));
+    CHECK_THAT(err.str(), !Contains(" --help "));
+    CHECK_THAT(err.str(), Contains(" --help-all "));
+    CHECK_THAT(err.str(), !Contains(" or "));
+    CHECK_THAT(err.str(), !Contains("ExtrasError"));
+    CHECK_THAT(err.str(), Contains("Thing"));
+    CHECK_THAT(err.str(), !Contains("Usage"));
+}
+
+TEST_CASE_METHOD(CapturedHelp, "ReplacedError", "[help]") {
     app.failure_message(CLI::FailureMessage::help);
 
-    EXPECT_EQ(run(CLI::ExtrasError({"Thing"})), static_cast<int>(CLI::ExitCodes::ExtrasError));
-    EXPECT_EQ(out.str(), "");
-    EXPECT_THAT(err.str(), Not(HasSubstr("for more information")));
-    EXPECT_THAT(err.str(), HasSubstr("ERROR: ExtrasError"));
-    EXPECT_THAT(err.str(), HasSubstr("Thing"));
-    EXPECT_THAT(err.str(), HasSubstr("Usage"));
+    CHECK(static_cast<int>(CLI::ExitCodes::ExtrasError) == run(CLI::ExtrasError({"Thing"})));
+    CHECK("" == out.str());
+    CHECK_THAT(err.str(), !Contains("for more information"));
+    CHECK_THAT(err.str(), Contains("ERROR: ExtrasError"));
+    CHECK_THAT(err.str(), Contains("Thing"));
+    CHECK_THAT(err.str(), Contains("Usage"));
 }
 
 // #87
-TEST(THelp, CustomDoubleOption) {
+TEST_CASE("THelp: CustomDoubleOption", "[help]") {
 
     std::pair<int, double> custom_opt;
 
@@ -885,74 +882,74 @@ TEST(THelp, CustomDoubleOption) {
     });
     opt->type_name("INT FLOAT")->type_size(2);
 
-    EXPECT_THAT(app.help(), Not(HasSubstr("x 2")));
+    CHECK_THAT(app.help(), !Contains("x 2"));
 }
 
-TEST(THelp, CheckEmptyTypeName) {
+TEST_CASE("THelp: CheckEmptyTypeName", "[help]") {
     CLI::App app;
 
     auto opt = app.add_flag("-f,--flag");
     std::string name = opt->get_type_name();
-    EXPECT_TRUE(name.empty());
+    CHECK(name.empty());
 }
 
-TEST(THelp, AccessDescription) {
+TEST_CASE("THelp: AccessDescription", "[help]") {
     CLI::App app{"My description goes here"};
 
-    EXPECT_EQ(app.get_description(), "My description goes here");
+    CHECK("My description goes here" == app.get_description());
 }
 
-TEST(THelp, SetDescriptionAfterCreation) {
+TEST_CASE("THelp: SetDescriptionAfterCreation", "[help]") {
     CLI::App app{""};
 
     app.description("My description goes here");
 
-    EXPECT_EQ(app.get_description(), "My description goes here");
-    EXPECT_THAT(app.help(), HasSubstr("My description goes here"));
+    CHECK("My description goes here" == app.get_description());
+    CHECK_THAT(app.help(), Contains("My description goes here"));
 }
 
-TEST(THelp, AccessOptionDescription) {
+TEST_CASE("THelp: AccessOptionDescription", "[help]") {
     CLI::App app{};
 
     int x{0};
     auto opt = app.add_option("-a,--alpha", x, "My description goes here");
 
-    EXPECT_EQ(opt->get_description(), "My description goes here");
+    CHECK("My description goes here" == opt->get_description());
 }
 
-TEST(THelp, SetOptionDescriptionAfterCreation) {
+TEST_CASE("THelp: SetOptionDescriptionAfterCreation", "[help]") {
     CLI::App app{};
 
     int x{0};
     auto opt = app.add_option("-a,--alpha", x);
     opt->description("My description goes here");
 
-    EXPECT_EQ(opt->get_description(), "My description goes here");
-    EXPECT_THAT(app.help(), HasSubstr("My description goes here"));
+    CHECK("My description goes here" == opt->get_description());
+    CHECK_THAT(app.help(), Contains("My description goes here"));
 }
 
-TEST(THelp, CleanNeeds) {
+TEST_CASE("THelp: CleanNeeds", "[help]") {
     CLI::App app;
 
     int x{0};
     auto a_name = app.add_option("-a,--alpha", x);
     app.add_option("-b,--boo", x)->needs(a_name);
 
-    EXPECT_THAT(app.help(), Not(HasSubstr("Requires")));
-    EXPECT_THAT(app.help(), Not(HasSubstr("Needs: -a,--alpha")));
-    EXPECT_THAT(app.help(), HasSubstr("Needs: --alpha"));
+    CHECK_THAT(app.help(), !Contains("Requires"));
+    CHECK_THAT(app.help(), !Contains("Needs: -a,--alpha"));
+    CHECK_THAT(app.help(), Contains("Needs: --alpha"));
 }
 
-TEST(THelp, RequiredPrintout) {
+TEST_CASE("THelp: RequiredPrintout", "[help]") {
     CLI::App app;
 
     int x{0};
     app.add_option("-a,--alpha", x)->required();
 
-    EXPECT_THAT(app.help(), HasSubstr(" REQUIRED"));
+    CHECK_THAT(app.help(), Contains(" REQUIRED"));
 }
 
-TEST(THelp, GroupOrder) {
+TEST_CASE("THelp: GroupOrder", "[help]") {
     CLI::App app;
 
     app.add_flag("--one")->group("zee");
@@ -963,12 +960,12 @@ TEST(THelp, GroupOrder) {
     auto zee_loc = help.find("zee");
     auto aee_loc = help.find("aee");
 
-    EXPECT_NE(zee_loc, std::string::npos);
-    EXPECT_NE(aee_loc, std::string::npos);
-    EXPECT_LT(zee_loc, aee_loc);
+    CHECK(std::string::npos != zee_loc);
+    CHECK(std::string::npos != aee_loc);
+    CHECK(aee_loc > zee_loc);
 }
 
-TEST(THelp, ValidatorsText) {
+TEST_CASE("THelp: ValidatorsText", "[help]") {
     CLI::App app;
 
     std::string filename;
@@ -979,52 +976,52 @@ TEST(THelp, ValidatorsText) {
     app.add_option("--f4", y)->check(CLI::Range(12));
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("TEXT:FILE"));
-    EXPECT_THAT(help, HasSubstr("INT in [1 - 4]"));
-    EXPECT_THAT(help, HasSubstr("UINT:INT in [0 - 12]"));  // Loses UINT
+    CHECK_THAT(help, Contains("TEXT:FILE"));
+    CHECK_THAT(help, Contains("INT in [1 - 4]"));
+    CHECK_THAT(help, Contains("UINT:INT in [0 - 12]"));
 }
 
-TEST(THelp, ValidatorsTextCustom) {
+TEST_CASE("THelp: ValidatorsTextCustom", "[help]") {
     CLI::App app;
 
     std::string filename;
     app.add_option("--f1", filename)->check(CLI::ExistingFile.description("Existing file"));
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("Existing file"));
+    CHECK_THAT(help, Contains("Existing file"));
 }
 
-TEST(THelp, ValidatorsNonPathText) {
+TEST_CASE("THelp: ValidatorsNonPathText", "[help]") {
     CLI::App app;
 
     std::string filename;
     app.add_option("--f2", filename)->check(CLI::NonexistentPath);
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("TEXT:PATH"));
+    CHECK_THAT(help, Contains("TEXT:PATH"));
 }
 
-TEST(THelp, ValidatorsDirText) {
+TEST_CASE("THelp: ValidatorsDirText", "[help]") {
     CLI::App app;
 
     std::string filename;
     app.add_option("--f2", filename)->check(CLI::ExistingDirectory);
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("TEXT:DIR"));
+    CHECK_THAT(help, Contains("TEXT:DIR"));
 }
 
-TEST(THelp, ValidatorsPathText) {
+TEST_CASE("THelp: ValidatorsPathText", "[help]") {
     CLI::App app;
 
     std::string filename;
     app.add_option("--f2", filename)->check(CLI::ExistingPath);
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("TEXT:PATH"));
+    CHECK_THAT(help, Contains("TEXT:PATH"));
 }
 
-TEST(THelp, CombinedValidatorsText) {
+TEST_CASE("THelp: CombinedValidatorsText", "[help]") {
     CLI::App app;
 
     std::string filename;
@@ -1034,12 +1031,12 @@ TEST(THelp, CombinedValidatorsText) {
     // Can't programmatically tell!
     // (Users can use ExistingPath, by the way)
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("TEXT:(FILE) OR (DIR)"));
-    EXPECT_THAT(help, Not(HasSubstr("PATH")));
+    CHECK_THAT(help, Contains("TEXT:(FILE) OR (DIR)"));
+    CHECK_THAT(help, !Contains("PATH"));
 }
 
 // Don't do this in real life, please
-TEST(THelp, CombinedValidatorsPathyText) {
+TEST_CASE("THelp: CombinedValidatorsPathyText", "[help]") {
     CLI::App app;
 
     std::string filename;
@@ -1047,12 +1044,12 @@ TEST(THelp, CombinedValidatorsPathyText) {
 
     // Combining validators with the same type string is OK
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("TEXT:"));
-    EXPECT_THAT(help, HasSubstr("PATH"));
+    CHECK_THAT(help, Contains("TEXT:"));
+    CHECK_THAT(help, Contains("PATH"));
 }
 
 // Don't do this in real life, please (and transform does nothing here)
-TEST(THelp, CombinedValidatorsPathyTextAsTransform) {
+TEST_CASE("THelp: CombinedValidatorsPathyTextAsTransform", "[help]") {
     CLI::App app;
 
     std::string filename;
@@ -1060,11 +1057,11 @@ TEST(THelp, CombinedValidatorsPathyTextAsTransform) {
 
     // Combining validators with the same type string is OK
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("TEXT:(PATH(existing)) OR (PATH"));
+    CHECK_THAT(help, Contains("TEXT:(PATH(existing)) OR (PATH"));
 }
 
 // #113 Part 2
-TEST(THelp, ChangingSet) {
+TEST_CASE("THelp: ChangingSet", "[help]") {
     CLI::App app;
 
     std::set<int> vals{1, 2, 3};
@@ -1073,19 +1070,19 @@ TEST(THelp, ChangingSet) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, Not(HasSubstr("4")));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, !Contains("4"));
 
     vals.insert(4);
     vals.erase(1);
 
     help = app.help();
 
-    EXPECT_THAT(help, Not(HasSubstr("1")));
-    EXPECT_THAT(help, HasSubstr("4"));
+    CHECK_THAT(help, !Contains("1"));
+    CHECK_THAT(help, Contains("4"));
 }
 
-TEST(THelp, ChangingSetDefaulted) {
+TEST_CASE("THelp: ChangingSetDefaulted", "[help]") {
     CLI::App app;
 
     std::set<int> vals{1, 2, 3};
@@ -1094,19 +1091,19 @@ TEST(THelp, ChangingSetDefaulted) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, Not(HasSubstr("4")));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, !Contains("4"));
 
     vals.insert(4);
     vals.erase(1);
 
     help = app.help();
 
-    EXPECT_THAT(help, Not(HasSubstr("1")));
-    EXPECT_THAT(help, HasSubstr("4"));
+    CHECK_THAT(help, !Contains("1"));
+    CHECK_THAT(help, Contains("4"));
 }
 
-TEST(THelp, ChangingCaselessSet) {
+TEST_CASE("THelp: ChangingCaselessSet", "[help]") {
     CLI::App app;
 
     std::set<std::string> vals{"1", "2", "3"};
@@ -1115,19 +1112,19 @@ TEST(THelp, ChangingCaselessSet) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, Not(HasSubstr("4")));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, !Contains("4"));
 
     vals.insert("4");
     vals.erase("1");
 
     help = app.help();
 
-    EXPECT_THAT(help, Not(HasSubstr("1")));
-    EXPECT_THAT(help, HasSubstr("4"));
+    CHECK_THAT(help, !Contains("1"));
+    CHECK_THAT(help, Contains("4"));
 }
 
-TEST(THelp, ChangingCaselessSetDefaulted) {
+TEST_CASE("THelp: ChangingCaselessSetDefaulted", "[help]") {
     CLI::App app;
     app.option_defaults()->always_capture_default();
 
@@ -1137,21 +1134,21 @@ TEST(THelp, ChangingCaselessSetDefaulted) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, Not(HasSubstr("4")));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, !Contains("4"));
 
     vals.insert("4");
     vals.erase("1");
 
     help = app.help();
 
-    EXPECT_THAT(help, Not(HasSubstr("1")));
-    EXPECT_THAT(help, HasSubstr("4"));
+    CHECK_THAT(help, !Contains("1"));
+    CHECK_THAT(help, Contains("4"));
 }
 
 // New defaults tests (1.8)
 
-TEST(THelp, ChangingDefaults) {
+TEST_CASE("THelp: ChangingDefaults", "[help]") {
 
     CLI::App app;
 
@@ -1164,10 +1161,10 @@ TEST(THelp, ChangingDefaults) {
     x = {5, 6};
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("INT=[3,4] ..."));
+    CHECK_THAT(help, Contains("INT=[3,4] ..."));
 }
 
-TEST(THelp, ChangingDefaultsWithAutoCapture) {
+TEST_CASE("THelp: ChangingDefaultsWithAutoCapture", "[help]") {
 
     CLI::App app;
     app.option_defaults()->always_capture_default();
@@ -1178,10 +1175,10 @@ TEST(THelp, ChangingDefaultsWithAutoCapture) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("INT=[1,2] ..."));
+    CHECK_THAT(help, Contains("INT=[1,2] ..."));
 }
 
-TEST(THelp, FunctionDefaultString) {
+TEST_CASE("THelp: FunctionDefaultString", "[help]") {
 
     CLI::App app;
 
@@ -1193,44 +1190,44 @@ TEST(THelp, FunctionDefaultString) {
 
     std::string help = app.help();
 
-    EXPECT_THAT(help, HasSubstr("INT=Powerful"));
+    CHECK_THAT(help, Contains("INT=Powerful"));
 }
 
-TEST(TVersion, simple_flag) {
+TEST_CASE("TVersion: simple_flag", "[help]") {
 
     CLI::App app;
 
     app.set_version_flag("-v,--version", "VERSION " CLI11_VERSION);
 
     auto vers = app.version();
-    EXPECT_THAT(vers, HasSubstr("VERSION"));
+    CHECK_THAT(vers, Contains("VERSION"));
 
     app.set_version_flag();
-    EXPECT_TRUE(app.version().empty());
+    CHECK(app.version().empty());
 }
 
-TEST(TVersion, callback_flag) {
+TEST_CASE("TVersion: callback_flag", "[help]") {
 
     CLI::App app;
 
     app.set_version_flag("-v,--version", []() { return std::string("VERSION " CLI11_VERSION); });
 
     auto vers = app.version();
-    EXPECT_THAT(vers, HasSubstr("VERSION"));
+    CHECK_THAT(vers, Contains("VERSION"));
 
     app.set_version_flag("-v", []() { return std::string("VERSION2 " CLI11_VERSION); });
     vers = app.version();
-    EXPECT_THAT(vers, HasSubstr("VERSION"));
+    CHECK_THAT(vers, Contains("VERSION"));
 }
 
-TEST(TVersion, parse_throw) {
+TEST_CASE("TVersion: parse_throw", "[help]") {
 
     CLI::App app;
 
     app.set_version_flag("--version", CLI11_VERSION);
 
-    EXPECT_THROW(app.parse("--version"), CLI::CallForVersion);
-    EXPECT_THROW(app.parse("--version --arg2 5"), CLI::CallForVersion);
+    CHECK_THROWS_AS(app.parse("--version"), CLI::CallForVersion);
+    CHECK_THROWS_AS(app.parse("--version --arg2 5"), CLI::CallForVersion);
 
     auto ptr = app.get_version_ptr();
 
@@ -1238,10 +1235,10 @@ TEST(TVersion, parse_throw) {
     try {
         app.parse("--Version");
     } catch(const CLI::CallForVersion &v) {
-        EXPECT_STREQ(v.what(), CLI11_VERSION);
-        EXPECT_EQ(v.get_exit_code(), 0);
+        CHECK_THAT(CLI11_VERSION, Catch::Equals(v.what()));
+        CHECK(0 == v.get_exit_code());
         const auto &appc = app;
         auto cptr = appc.get_version_ptr();
-        EXPECT_EQ(cptr->count(), 1U);
+        CHECK(1U == cptr->count());
     }
 }
diff --git a/packages/CLI11/tests/HelpersTest.cpp b/packages/CLI11/tests/HelpersTest.cpp
index 993b8a9f436e3916c3d37df22bd43bd904eedac9..0c032fbce389899324025f4c8759e328e5ffaea2 100644
--- a/packages/CLI11/tests/HelpersTest.cpp
+++ b/packages/CLI11/tests/HelpersTest.cpp
@@ -8,11 +8,11 @@
 
 #include <array>
 #include <atomic>
-#include <climits>
 #include <complex>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
+#include <limits>
 #include <map>
 #include <string>
 #include <tuple>
@@ -25,146 +25,146 @@ class Streamable {};
 
 std::ostream &operator<<(std::ostream &out, const Streamable &) { return out << "Streamable"; }
 
-TEST(TypeTools, Streaming) {
+TEST_CASE("TypeTools: Streaming", "[helpers]") {
 
-    EXPECT_EQ(CLI::detail::to_string(NotStreamable{}), "");
+    CHECK("" == CLI::detail::to_string(NotStreamable{}));
 
-    EXPECT_EQ(CLI::detail::to_string(Streamable{}), "Streamable");
+    CHECK("Streamable" == CLI::detail::to_string(Streamable{}));
 
-    EXPECT_EQ(CLI::detail::to_string(5), "5");
+    CHECK("5" == CLI::detail::to_string(5));
 
-    EXPECT_EQ(CLI::detail::to_string("string"), std::string("string"));
-    EXPECT_EQ(CLI::detail::to_string(std::string("string")), std::string("string"));
+    CHECK(std::string("string") == CLI::detail::to_string("string"));
+    CHECK(std::string("string") == CLI::detail::to_string(std::string("string")));
 }
 
-TEST(TypeTools, tuple) {
-    EXPECT_FALSE(CLI::detail::is_tuple_like<int>::value);
-    EXPECT_FALSE(CLI::detail::is_tuple_like<std::vector<double>>::value);
+TEST_CASE("TypeTools: tuple", "[helpers]") {
+    CHECK_FALSE(CLI::detail::is_tuple_like<int>::value);
+    CHECK_FALSE(CLI::detail::is_tuple_like<std::vector<double>>::value);
     auto v = CLI::detail::is_tuple_like<std::tuple<double, int>>::value;
-    EXPECT_TRUE(v);
+    CHECK(v);
     v = CLI::detail::is_tuple_like<std::tuple<double, double, double>>::value;
-    EXPECT_TRUE(v);
+    CHECK(v);
 }
 
-TEST(TypeTools, type_size) {
+TEST_CASE("TypeTools: type_size", "[helpers]") {
     auto V = CLI::detail::type_count<int>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::type_count<void>::value;
-    EXPECT_EQ(V, 0);
+    CHECK(0 == V);
     V = CLI::detail::type_count<std::vector<double>>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::type_count<std::tuple<double, int>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
     V = CLI::detail::type_count<std::tuple<std::string, double, int>>::value;
-    EXPECT_EQ(V, 3);
+    CHECK(3 == V);
     V = CLI::detail::type_count<std::array<std::string, 5>>::value;
-    EXPECT_EQ(V, 5);
+    CHECK(5 == V);
     V = CLI::detail::type_count<std::vector<std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
     V = CLI::detail::type_count<std::tuple<std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
     V = CLI::detail::type_count<std::tuple<int, std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 3);
+    CHECK(3 == V);
     V = CLI::detail::type_count<std::tuple<std::pair<int, double>, std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 4);
+    CHECK(4 == V);
     // maps
     V = CLI::detail::type_count<std::map<int, std::pair<int, double>>>::value;
-    EXPECT_EQ(V, 3);
+    CHECK(3 == V);
     // three level tuples
     V = CLI::detail::type_count<std::tuple<int, std::pair<int, std::tuple<int, double, std::string>>>>::value;
-    EXPECT_EQ(V, 5);
+    CHECK(5 == V);
     V = CLI::detail::type_count<std::pair<int, std::vector<int>>>::value;
-    EXPECT_GE(V, CLI::detail::expected_max_vector_size);
+    CHECK(CLI::detail::expected_max_vector_size <= V);
     V = CLI::detail::type_count<std::vector<std::vector<int>>>::value;
-    EXPECT_EQ(V, CLI::detail::expected_max_vector_size);
+    CHECK(CLI::detail::expected_max_vector_size == V);
 }
 
-TEST(TypeTools, type_size_min) {
+TEST_CASE("TypeTools: type_size_min", "[helpers]") {
     auto V = CLI::detail::type_count_min<int>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::type_count_min<void>::value;
-    EXPECT_EQ(V, 0);
+    CHECK(0 == V);
     V = CLI::detail::type_count_min<std::vector<double>>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::type_count_min<std::tuple<double, int>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
     V = CLI::detail::type_count_min<std::tuple<std::string, double, int>>::value;
-    EXPECT_EQ(V, 3);
+    CHECK(3 == V);
     V = CLI::detail::type_count_min<std::array<std::string, 5>>::value;
-    EXPECT_EQ(V, 5);
+    CHECK(5 == V);
     V = CLI::detail::type_count_min<std::vector<std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
     V = CLI::detail::type_count_min<std::tuple<std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
     V = CLI::detail::type_count_min<std::tuple<int, std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 3);
+    CHECK(3 == V);
     V = CLI::detail::type_count_min<std::tuple<std::pair<int, double>, std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, 4);
+    CHECK(4 == V);
     // maps
     V = CLI::detail::type_count_min<std::map<int, std::pair<int, double>>>::value;
-    EXPECT_EQ(V, 3);
+    CHECK(3 == V);
     // three level tuples
     V = CLI::detail::type_count_min<std::tuple<int, std::pair<int, std::tuple<int, double, std::string>>>>::value;
-    EXPECT_EQ(V, 5);
+    CHECK(5 == V);
     V = CLI::detail::type_count_min<std::pair<int, std::vector<int>>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
     V = CLI::detail::type_count_min<std::vector<std::vector<int>>>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::type_count_min<std::vector<std::vector<std::pair<int, int>>>>::value;
-    EXPECT_EQ(V, 2);
+    CHECK(2 == V);
 }
 
-TEST(TypeTools, expected_count) {
+TEST_CASE("TypeTools: expected_count", "[helpers]") {
     auto V = CLI::detail::expected_count<int>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::expected_count<void>::value;
-    EXPECT_EQ(V, 0);
+    CHECK(0 == V);
     V = CLI::detail::expected_count<std::vector<double>>::value;
-    EXPECT_EQ(V, CLI::detail::expected_max_vector_size);
+    CHECK(CLI::detail::expected_max_vector_size == V);
     V = CLI::detail::expected_count<std::tuple<double, int>>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::expected_count<std::tuple<std::string, double, int>>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::expected_count<std::array<std::string, 5>>::value;
-    EXPECT_EQ(V, 1);
+    CHECK(1 == V);
     V = CLI::detail::expected_count<std::vector<std::pair<std::string, double>>>::value;
-    EXPECT_EQ(V, CLI::detail::expected_max_vector_size);
+    CHECK(CLI::detail::expected_max_vector_size == V);
 }
 
-TEST(Split, SimpleByToken) {
+TEST_CASE("Split: SimpleByToken", "[helpers]") {
     auto out = CLI::detail::split("one.two.three", '.');
-    ASSERT_EQ(3u, out.size());
-    EXPECT_EQ("one", out.at(0));
-    EXPECT_EQ("two", out.at(1));
-    EXPECT_EQ("three", out.at(2));
+    REQUIRE(out.size() == 3u);
+    CHECK(out.at(0) == "one");
+    CHECK(out.at(1) == "two");
+    CHECK(out.at(2) == "three");
 }
 
-TEST(Split, Single) {
+TEST_CASE("Split: Single", "[helpers]") {
     auto out = CLI::detail::split("one", '.');
-    ASSERT_EQ(1u, out.size());
-    EXPECT_EQ("one", out.at(0));
+    REQUIRE(out.size() == 1u);
+    CHECK(out.at(0) == "one");
 }
 
-TEST(Split, Empty) {
+TEST_CASE("Split: Empty", "[helpers]") {
     auto out = CLI::detail::split("", '.');
-    ASSERT_EQ(1u, out.size());
-    EXPECT_EQ("", out.at(0));
+    REQUIRE(out.size() == 1u);
+    CHECK(out.at(0) == "");
 }
 
-TEST(String, InvalidName) {
-    EXPECT_TRUE(CLI::detail::valid_name_string("valid"));
-    EXPECT_FALSE(CLI::detail::valid_name_string("-invalid"));
-    EXPECT_TRUE(CLI::detail::valid_name_string("va-li-d"));
-    EXPECT_FALSE(CLI::detail::valid_name_string("vali&d"));
-    EXPECT_TRUE(CLI::detail::valid_name_string("_valid"));
-    EXPECT_FALSE(CLI::detail::valid_name_string("/valid"));
-    EXPECT_TRUE(CLI::detail::valid_name_string("vali?d"));
-    EXPECT_TRUE(CLI::detail::valid_name_string("@@@@"));
-    EXPECT_TRUE(CLI::detail::valid_name_string("b@d2?"));
-    EXPECT_TRUE(CLI::detail::valid_name_string("2vali?d"));
+TEST_CASE("String: InvalidName", "[helpers]") {
+    CHECK(CLI::detail::valid_name_string("valid"));
+    CHECK_FALSE(CLI::detail::valid_name_string("-invalid"));
+    CHECK(CLI::detail::valid_name_string("va-li-d"));
+    CHECK_FALSE(CLI::detail::valid_name_string("vali&d"));
+    CHECK(CLI::detail::valid_name_string("_valid"));
+    CHECK_FALSE(CLI::detail::valid_name_string("/valid"));
+    CHECK(CLI::detail::valid_name_string("vali?d"));
+    CHECK(CLI::detail::valid_name_string("@@@@"));
+    CHECK(CLI::detail::valid_name_string("b@d2?"));
+    CHECK(CLI::detail::valid_name_string("2vali?d"));
 }
 
-TEST(StringTools, Modify) {
+TEST_CASE("StringTools: Modify", "[helpers]") {
     int cnt{0};
     std::string newString = CLI::detail::find_and_modify("======", "=", [&cnt](std::string &str, std::size_t index) {
         if((++cnt) % 2 == 0) {
@@ -172,10 +172,10 @@ TEST(StringTools, Modify) {
         }
         return index + 1;
     });
-    EXPECT_EQ(newString, "=:=:=:");
+    CHECK("=:=:=:" == newString);
 }
 
-TEST(StringTools, Modify2) {
+TEST_CASE("StringTools: Modify2", "[helpers]") {
     std::string newString =
         CLI::detail::find_and_modify("this is a string test", "is", [](std::string &str, std::size_t index) {
             if((index > 1) && (str[index - 1] != ' ')) {
@@ -184,296 +184,296 @@ TEST(StringTools, Modify2) {
             }
             return index + 1;
         });
-    EXPECT_EQ(newString, "that is a string test");
+    CHECK("that is a string test" == newString);
 }
 
-TEST(StringTools, Modify3) {
+TEST_CASE("StringTools: Modify3", "[helpers]") {
     // this picks up 3 sets of 3 after the 'b' then collapses the new first set
     std::string newString = CLI::detail::find_and_modify("baaaaaaaaaa", "aaa", [](std::string &str, std::size_t index) {
         str.erase(index, 3);
         str.insert(str.begin(), 'a');
         return 0u;
     });
-    EXPECT_EQ(newString, "aba");
-}
-
-TEST(StringTools, flagValues) {
-    EXPECT_EQ(CLI::detail::to_flag_value("0"), -1);
-    EXPECT_EQ(CLI::detail::to_flag_value("t"), 1);
-    EXPECT_EQ(CLI::detail::to_flag_value("1"), 1);
-    EXPECT_EQ(CLI::detail::to_flag_value("6"), 6);
-    EXPECT_EQ(CLI::detail::to_flag_value("-6"), -6);
-    EXPECT_EQ(CLI::detail::to_flag_value("false"), -1);
-    EXPECT_EQ(CLI::detail::to_flag_value("YES"), 1);
-    EXPECT_THROW(CLI::detail::to_flag_value("frog"), std::invalid_argument);
-    EXPECT_THROW(CLI::detail::to_flag_value("q"), std::invalid_argument);
-    EXPECT_EQ(CLI::detail::to_flag_value("NO"), -1);
-    EXPECT_EQ(CLI::detail::to_flag_value("475555233"), 475555233);
-}
-
-TEST(StringTools, Validation) {
-    EXPECT_TRUE(CLI::detail::isalpha(""));
-    EXPECT_TRUE(CLI::detail::isalpha("a"));
-    EXPECT_TRUE(CLI::detail::isalpha("abcd"));
-    EXPECT_FALSE(CLI::detail::isalpha("_"));
-    EXPECT_FALSE(CLI::detail::isalpha("2"));
-    EXPECT_FALSE(CLI::detail::isalpha("test test"));
-    EXPECT_FALSE(CLI::detail::isalpha("test "));
-    EXPECT_FALSE(CLI::detail::isalpha(" test"));
-    EXPECT_FALSE(CLI::detail::isalpha("test2"));
-}
-
-TEST(Trim, Various) {
+    CHECK("aba" == newString);
+}
+
+TEST_CASE("StringTools: flagValues", "[helpers]") {
+    CHECK(-1 == CLI::detail::to_flag_value("0"));
+    CHECK(1 == CLI::detail::to_flag_value("t"));
+    CHECK(1 == CLI::detail::to_flag_value("1"));
+    CHECK(6 == CLI::detail::to_flag_value("6"));
+    CHECK(-6 == CLI::detail::to_flag_value("-6"));
+    CHECK(-1 == CLI::detail::to_flag_value("false"));
+    CHECK(1 == CLI::detail::to_flag_value("YES"));
+    CHECK_THROWS_AS(CLI::detail::to_flag_value("frog"), std::invalid_argument);
+    CHECK_THROWS_AS(CLI::detail::to_flag_value("q"), std::invalid_argument);
+    CHECK(-1 == CLI::detail::to_flag_value("NO"));
+    CHECK(475555233 == CLI::detail::to_flag_value("475555233"));
+}
+
+TEST_CASE("StringTools: Validation", "[helpers]") {
+    CHECK(CLI::detail::isalpha(""));
+    CHECK(CLI::detail::isalpha("a"));
+    CHECK(CLI::detail::isalpha("abcd"));
+    CHECK_FALSE(CLI::detail::isalpha("_"));
+    CHECK_FALSE(CLI::detail::isalpha("2"));
+    CHECK_FALSE(CLI::detail::isalpha("test test"));
+    CHECK_FALSE(CLI::detail::isalpha("test "));
+    CHECK_FALSE(CLI::detail::isalpha(" test"));
+    CHECK_FALSE(CLI::detail::isalpha("test2"));
+}
+
+TEST_CASE("Trim: Various", "[helpers]") {
     std::string s1{"  sdlfkj sdflk sd s  "};
     std::string a1{"sdlfkj sdflk sd s"};
     CLI::detail::trim(s1);
-    EXPECT_EQ(a1, s1);
+    CHECK(s1 == a1);
 
     std::string s2{" a \t"};
     CLI::detail::trim(s2);
-    EXPECT_EQ("a", s2);
+    CHECK(s2 == "a");
 
     std::string s3{" a \n"};
     CLI::detail::trim(s3);
-    EXPECT_EQ("a", s3);
+    CHECK(s3 == "a");
 
     std::string s4{" a b "};
-    EXPECT_EQ("a b", CLI::detail::trim(s4));
+    CHECK(CLI::detail::trim(s4) == "a b");
 }
 
-TEST(Trim, VariousFilters) {
+TEST_CASE("Trim: VariousFilters", "[helpers]") {
     std::string s1{"  sdlfkj sdflk sd s  "};
     std::string a1{"sdlfkj sdflk sd s"};
     CLI::detail::trim(s1, " ");
-    EXPECT_EQ(a1, s1);
+    CHECK(s1 == a1);
 
     std::string s2{" a \t"};
     CLI::detail::trim(s2, " ");
-    EXPECT_EQ("a \t", s2);
+    CHECK(s2 == "a \t");
 
     std::string s3{"abdavda"};
     CLI::detail::trim(s3, "a");
-    EXPECT_EQ("bdavd", s3);
+    CHECK(s3 == "bdavd");
 
     std::string s4{"abcabcabc"};
-    EXPECT_EQ("cabcabc", CLI::detail::trim(s4, "ab"));
+    CHECK(CLI::detail::trim(s4, "ab") == "cabcabc");
 }
 
-TEST(Trim, TrimCopy) {
+TEST_CASE("Trim: TrimCopy", "[helpers]") {
     std::string orig{" cabc  "};
     std::string trimmed = CLI::detail::trim_copy(orig);
-    EXPECT_EQ("cabc", trimmed);
-    EXPECT_NE(orig, trimmed);
+    CHECK(trimmed == "cabc");
+    CHECK(trimmed != orig);
     CLI::detail::trim(orig);
-    EXPECT_EQ(trimmed, orig);
+    CHECK(orig == trimmed);
 
     orig = "abcabcabc";
     trimmed = CLI::detail::trim_copy(orig, "ab");
-    EXPECT_EQ("cabcabc", trimmed);
-    EXPECT_NE(orig, trimmed);
+    CHECK(trimmed == "cabcabc");
+    CHECK(trimmed != orig);
     CLI::detail::trim(orig, "ab");
-    EXPECT_EQ(trimmed, orig);
+    CHECK(orig == trimmed);
 }
 
-TEST(Validators, FileExists) {
+TEST_CASE("Validators: FileExists", "[helpers]") {
     std::string myfile{"TestFileNotUsed.txt"};
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK_FALSE(CLI::ExistingFile(myfile).empty());
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
-    EXPECT_TRUE(CLI::ExistingFile(myfile).empty());
+    CHECK(ok);
+    CHECK(CLI::ExistingFile(myfile).empty());
 
     std::remove(myfile.c_str());
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK_FALSE(CLI::ExistingFile(myfile).empty());
 }
 
-TEST(Validators, FileNotExists) {
+TEST_CASE("Validators: FileNotExists", "[helpers]") {
     std::string myfile{"TestFileNotUsed.txt"};
-    EXPECT_TRUE(CLI::NonexistentPath(myfile).empty());
+    CHECK(CLI::NonexistentPath(myfile).empty());
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
-    EXPECT_FALSE(CLI::NonexistentPath(myfile).empty());
+    CHECK(ok);
+    CHECK_FALSE(CLI::NonexistentPath(myfile).empty());
 
     std::remove(myfile.c_str());
-    EXPECT_TRUE(CLI::NonexistentPath(myfile).empty());
+    CHECK(CLI::NonexistentPath(myfile).empty());
 }
 
-TEST(Validators, FileIsDir) {
+TEST_CASE("Validators: FileIsDir", "[helpers]") {
     std::string mydir{"../tests"};
-    EXPECT_NE(CLI::ExistingFile(mydir), "");
+    CHECK("" != CLI::ExistingFile(mydir));
 }
 
-TEST(Validators, DirectoryExists) {
+TEST_CASE("Validators: DirectoryExists", "[helpers]") {
     std::string mydir{"../tests"};
-    EXPECT_EQ(CLI::ExistingDirectory(mydir), "");
+    CHECK("" == CLI::ExistingDirectory(mydir));
 }
 
-TEST(Validators, DirectoryNotExists) {
+TEST_CASE("Validators: DirectoryNotExists", "[helpers]") {
     std::string mydir{"nondirectory"};
-    EXPECT_NE(CLI::ExistingDirectory(mydir), "");
+    CHECK("" != CLI::ExistingDirectory(mydir));
 }
 
-TEST(Validators, DirectoryIsFile) {
+TEST_CASE("Validators: DirectoryIsFile", "[helpers]") {
     std::string myfile{"TestFileNotUsed.txt"};
-    EXPECT_TRUE(CLI::NonexistentPath(myfile).empty());
+    CHECK(CLI::NonexistentPath(myfile).empty());
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
-    EXPECT_FALSE(CLI::ExistingDirectory(myfile).empty());
+    CHECK(ok);
+    CHECK_FALSE(CLI::ExistingDirectory(myfile).empty());
 
     std::remove(myfile.c_str());
-    EXPECT_TRUE(CLI::NonexistentPath(myfile).empty());
+    CHECK(CLI::NonexistentPath(myfile).empty());
 }
 
-TEST(Validators, PathExistsDir) {
+TEST_CASE("Validators: PathExistsDir", "[helpers]") {
     std::string mydir{"../tests"};
-    EXPECT_EQ(CLI::ExistingPath(mydir), "");
+    CHECK("" == CLI::ExistingPath(mydir));
 }
 
-TEST(Validators, PathExistsFile) {
+TEST_CASE("Validators: PathExistsFile", "[helpers]") {
     std::string myfile{"TestFileNotUsed.txt"};
-    EXPECT_FALSE(CLI::ExistingPath(myfile).empty());
+    CHECK_FALSE(CLI::ExistingPath(myfile).empty());
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
-    EXPECT_TRUE(CLI::ExistingPath(myfile).empty());
+    CHECK(ok);
+    CHECK(CLI::ExistingPath(myfile).empty());
 
     std::remove(myfile.c_str());
-    EXPECT_FALSE(CLI::ExistingPath(myfile).empty());
+    CHECK_FALSE(CLI::ExistingPath(myfile).empty());
 }
 
-TEST(Validators, PathNotExistsDir) {
+TEST_CASE("Validators: PathNotExistsDir", "[helpers]") {
     std::string mydir{"nonpath"};
-    EXPECT_NE(CLI::ExistingPath(mydir), "");
+    CHECK("" != CLI::ExistingPath(mydir));
 }
 
-TEST(Validators, IPValidate1) {
+TEST_CASE("Validators: IPValidate1", "[helpers]") {
     std::string ip = "1.1.1.1";
-    EXPECT_TRUE(CLI::ValidIPV4(ip).empty());
+    CHECK(CLI::ValidIPV4(ip).empty());
     ip = "224.255.0.1";
-    EXPECT_TRUE(CLI::ValidIPV4(ip).empty());
+    CHECK(CLI::ValidIPV4(ip).empty());
     ip = "-1.255.0.1";
-    EXPECT_FALSE(CLI::ValidIPV4(ip).empty());
+    CHECK_FALSE(CLI::ValidIPV4(ip).empty());
     ip = "1.256.0.1";
-    EXPECT_FALSE(CLI::ValidIPV4(ip).empty());
+    CHECK_FALSE(CLI::ValidIPV4(ip).empty());
     ip = "1.256.0.1";
-    EXPECT_FALSE(CLI::ValidIPV4(ip).empty());
+    CHECK_FALSE(CLI::ValidIPV4(ip).empty());
     ip = "aaa";
-    EXPECT_FALSE(CLI::ValidIPV4(ip).empty());
+    CHECK_FALSE(CLI::ValidIPV4(ip).empty());
     ip = "1.2.3.abc";
-    EXPECT_FALSE(CLI::ValidIPV4(ip).empty());
+    CHECK_FALSE(CLI::ValidIPV4(ip).empty());
     ip = "11.22";
-    EXPECT_FALSE(CLI::ValidIPV4(ip).empty());
+    CHECK_FALSE(CLI::ValidIPV4(ip).empty());
 }
 
-TEST(Validators, PositiveValidator) {
+TEST_CASE("Validators: PositiveValidator", "[helpers]") {
     std::string num = "1.1.1.1";
-    EXPECT_FALSE(CLI::PositiveNumber(num).empty());
+    CHECK_FALSE(CLI::PositiveNumber(num).empty());
     num = "1";
-    EXPECT_TRUE(CLI::PositiveNumber(num).empty());
+    CHECK(CLI::PositiveNumber(num).empty());
     num = "10000";
-    EXPECT_TRUE(CLI::PositiveNumber(num).empty());
+    CHECK(CLI::PositiveNumber(num).empty());
     num = "0";
-    EXPECT_FALSE(CLI::PositiveNumber(num).empty());
+    CHECK_FALSE(CLI::PositiveNumber(num).empty());
     num = "+0.5";
-    EXPECT_TRUE(CLI::PositiveNumber(num).empty());
+    CHECK(CLI::PositiveNumber(num).empty());
     num = "-1";
-    EXPECT_FALSE(CLI::PositiveNumber(num).empty());
+    CHECK_FALSE(CLI::PositiveNumber(num).empty());
     num = "-1.5";
-    EXPECT_FALSE(CLI::PositiveNumber(num).empty());
+    CHECK_FALSE(CLI::PositiveNumber(num).empty());
     num = "a";
-    EXPECT_FALSE(CLI::PositiveNumber(num).empty());
+    CHECK_FALSE(CLI::PositiveNumber(num).empty());
 }
 
-TEST(Validators, NonNegativeValidator) {
+TEST_CASE("Validators: NonNegativeValidator", "[helpers]") {
     std::string num = "1.1.1.1";
-    EXPECT_FALSE(CLI::NonNegativeNumber(num).empty());
+    CHECK_FALSE(CLI::NonNegativeNumber(num).empty());
     num = "1";
-    EXPECT_TRUE(CLI::NonNegativeNumber(num).empty());
+    CHECK(CLI::NonNegativeNumber(num).empty());
     num = "10000";
-    EXPECT_TRUE(CLI::NonNegativeNumber(num).empty());
+    CHECK(CLI::NonNegativeNumber(num).empty());
     num = "0";
-    EXPECT_TRUE(CLI::NonNegativeNumber(num).empty());
+    CHECK(CLI::NonNegativeNumber(num).empty());
     num = "+0.5";
-    EXPECT_TRUE(CLI::NonNegativeNumber(num).empty());
+    CHECK(CLI::NonNegativeNumber(num).empty());
     num = "-1";
-    EXPECT_FALSE(CLI::NonNegativeNumber(num).empty());
+    CHECK_FALSE(CLI::NonNegativeNumber(num).empty());
     num = "-1.5";
-    EXPECT_FALSE(CLI::NonNegativeNumber(num).empty());
+    CHECK_FALSE(CLI::NonNegativeNumber(num).empty());
     num = "a";
-    EXPECT_FALSE(CLI::NonNegativeNumber(num).empty());
+    CHECK_FALSE(CLI::NonNegativeNumber(num).empty());
 }
 
-TEST(Validators, NumberValidator) {
+TEST_CASE("Validators: NumberValidator", "[helpers]") {
     std::string num = "1.1.1.1";
-    EXPECT_FALSE(CLI::Number(num).empty());
+    CHECK_FALSE(CLI::Number(num).empty());
     num = "1.7";
-    EXPECT_TRUE(CLI::Number(num).empty());
+    CHECK(CLI::Number(num).empty());
     num = "10000";
-    EXPECT_TRUE(CLI::Number(num).empty());
+    CHECK(CLI::Number(num).empty());
     num = "-0.000";
-    EXPECT_TRUE(CLI::Number(num).empty());
+    CHECK(CLI::Number(num).empty());
     num = "+1.55";
-    EXPECT_TRUE(CLI::Number(num).empty());
+    CHECK(CLI::Number(num).empty());
     num = "a";
-    EXPECT_FALSE(CLI::Number(num).empty());
+    CHECK_FALSE(CLI::Number(num).empty());
 }
 
-TEST(Validators, CombinedAndRange) {
+TEST_CASE("Validators: CombinedAndRange", "[helpers]") {
     auto crange = CLI::Range(0, 12) & CLI::Range(4, 16);
-    EXPECT_TRUE(crange("4").empty());
-    EXPECT_TRUE(crange("12").empty());
-    EXPECT_TRUE(crange("7").empty());
+    CHECK(crange("4").empty());
+    CHECK(crange("12").empty());
+    CHECK(crange("7").empty());
 
-    EXPECT_FALSE(crange("-2").empty());
-    EXPECT_FALSE(crange("2").empty());
-    EXPECT_FALSE(crange("15").empty());
-    EXPECT_FALSE(crange("16").empty());
-    EXPECT_FALSE(crange("18").empty());
+    CHECK_FALSE(crange("-2").empty());
+    CHECK_FALSE(crange("2").empty());
+    CHECK_FALSE(crange("15").empty());
+    CHECK_FALSE(crange("16").empty());
+    CHECK_FALSE(crange("18").empty());
 }
 
-TEST(Validators, CombinedOrRange) {
+TEST_CASE("Validators: CombinedOrRange", "[helpers]") {
     auto crange = CLI::Range(0, 4) | CLI::Range(8, 12);
 
-    EXPECT_FALSE(crange("-2").empty());
-    EXPECT_TRUE(crange("2").empty());
-    EXPECT_FALSE(crange("5").empty());
-    EXPECT_TRUE(crange("8").empty());
-    EXPECT_TRUE(crange("12").empty());
-    EXPECT_FALSE(crange("16").empty());
+    CHECK_FALSE(crange("-2").empty());
+    CHECK(crange("2").empty());
+    CHECK_FALSE(crange("5").empty());
+    CHECK(crange("8").empty());
+    CHECK(crange("12").empty());
+    CHECK_FALSE(crange("16").empty());
 }
 
-TEST(Validators, CombinedPaths) {
+TEST_CASE("Validators: CombinedPaths", "[helpers]") {
     std::string myfile{"TestFileNotUsed.txt"};
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK_FALSE(CLI::ExistingFile(myfile).empty());
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
+    CHECK(ok);
 
     std::string dir{"../tests"};
     std::string notpath{"nondirectory"};
 
     auto path_or_dir = CLI::ExistingPath | CLI::ExistingDirectory;
-    EXPECT_TRUE(path_or_dir(dir).empty());
-    EXPECT_TRUE(path_or_dir(myfile).empty());
-    EXPECT_FALSE(path_or_dir(notpath).empty());
+    CHECK(path_or_dir(dir).empty());
+    CHECK(path_or_dir(myfile).empty());
+    CHECK_FALSE(path_or_dir(notpath).empty());
 
     auto file_or_dir = CLI::ExistingFile | CLI::ExistingDirectory;
-    EXPECT_TRUE(file_or_dir(dir).empty());
-    EXPECT_TRUE(file_or_dir(myfile).empty());
-    EXPECT_FALSE(file_or_dir(notpath).empty());
+    CHECK(file_or_dir(dir).empty());
+    CHECK(file_or_dir(myfile).empty());
+    CHECK_FALSE(file_or_dir(notpath).empty());
 
     auto path_and_dir = CLI::ExistingPath & CLI::ExistingDirectory;
-    EXPECT_TRUE(path_and_dir(dir).empty());
-    EXPECT_FALSE(path_and_dir(myfile).empty());
-    EXPECT_FALSE(path_and_dir(notpath).empty());
+    CHECK(path_and_dir(dir).empty());
+    CHECK_FALSE(path_and_dir(myfile).empty());
+    CHECK_FALSE(path_and_dir(notpath).empty());
 
     auto path_and_file = CLI::ExistingFile & CLI::ExistingDirectory;
-    EXPECT_FALSE(path_and_file(dir).empty());
-    EXPECT_FALSE(path_and_file(myfile).empty());
-    EXPECT_FALSE(path_and_file(notpath).empty());
+    CHECK_FALSE(path_and_file(dir).empty());
+    CHECK_FALSE(path_and_file(myfile).empty());
+    CHECK_FALSE(path_and_file(notpath).empty());
 
     std::remove(myfile.c_str());
-    EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+    CHECK_FALSE(CLI::ExistingFile(myfile).empty());
 }
 
-TEST(Validators, ProgramNameSplit) {
+TEST_CASE("Validators: ProgramNameSplit", "[helpers]") {
     TempFile myfile{"program_name1.exe"};
     {
         std::ofstream out{myfile};
@@ -481,8 +481,8 @@ TEST(Validators, ProgramNameSplit) {
     }
     auto res =
         CLI::detail::split_program_name(std::string("./") + std::string(myfile) + " this is a bunch of extra stuff  ");
-    EXPECT_EQ(res.first, std::string("./") + std::string(myfile));
-    EXPECT_EQ(res.second, "this is a bunch of extra stuff");
+    CHECK(std::string("./") + std::string(myfile) == res.first);
+    CHECK("this is a bunch of extra stuff" == res.second);
 
     TempFile myfile2{"program name1.exe"};
     {
@@ -491,257 +491,257 @@ TEST(Validators, ProgramNameSplit) {
     }
     res = CLI::detail::split_program_name(std::string("   ") + std::string("./") + std::string(myfile2) +
                                           "      this is a bunch of extra stuff  ");
-    EXPECT_EQ(res.first, std::string("./") + std::string(myfile2));
-    EXPECT_EQ(res.second, "this is a bunch of extra stuff");
+    CHECK(std::string("./") + std::string(myfile2) == res.first);
+    CHECK("this is a bunch of extra stuff" == res.second);
 
     res = CLI::detail::split_program_name("./program_name    this is a bunch of extra stuff  ");
-    EXPECT_EQ(res.first, "./program_name");  // test sectioning of first argument even if it can't detect the file
-    EXPECT_EQ(res.second, "this is a bunch of extra stuff");
+    CHECK("./program_name" == res.first);
+    CHECK("this is a bunch of extra stuff" == res.second);
 
     res = CLI::detail::split_program_name(std::string("  ./") + std::string(myfile) + "    ");
-    EXPECT_EQ(res.first, std::string("./") + std::string(myfile));
-    EXPECT_TRUE(res.second.empty());
+    CHECK(std::string("./") + std::string(myfile) == res.first);
+    CHECK(res.second.empty());
 }
 
-TEST(CheckedMultiply, Int) {
+TEST_CASE("CheckedMultiply: Int", "[helpers]") {
     int a{10};
     int b{-20};
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, -200);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(-200 == a);
 
     a = 0;
     b = -20;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, 0);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(0 == a);
 
     a = 20;
     b = 0;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, 0);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(0 == a);
 
     a = std::numeric_limits<int>::max();
     b = 1;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::max());
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::max() == a);
 
     a = std::numeric_limits<int>::max();
     b = 2;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::max());
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::max() == a);
 
     a = std::numeric_limits<int>::max();
     b = -1;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, -std::numeric_limits<int>::max());
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(-std::numeric_limits<int>::max() == a);
 
     a = std::numeric_limits<int>::max();
     b = std::numeric_limits<int>::max();
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::max());
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::max() == a);
 
     a = std::numeric_limits<int>::min();
     b = std::numeric_limits<int>::max();
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::min());
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::min() == a);
 
     a = std::numeric_limits<int>::min();
     b = 1;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::min());
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::min() == a);
 
     a = std::numeric_limits<int>::min();
     b = -1;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::min());
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::min() == a);
 
     b = std::numeric_limits<int>::min();
     a = -1;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, -1);
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(-1 == a);
 
     a = std::numeric_limits<int>::min() / 100;
     b = 99;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::min() / 100 * 99);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::min() / 100 * 99 == a);
 
     a = std::numeric_limits<int>::min() / 100;
     b = -101;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<int>::min() / 100);
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<int>::min() / 100 == a);
     a = 2;
     b = std::numeric_limits<int>::min() / 2;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(CLI::detail::checked_multiply(a, b));
     a = std::numeric_limits<int>::min() / 2;
     b = 2;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(CLI::detail::checked_multiply(a, b));
 
     a = 4;
     b = std::numeric_limits<int>::min() / 4;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(CLI::detail::checked_multiply(a, b));
 
     a = 48;
     b = std::numeric_limits<int>::min() / 48;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(CLI::detail::checked_multiply(a, b));
 }
 
-TEST(CheckedMultiply, SizeT) {
+TEST_CASE("CheckedMultiply: SizeT", "[helpers]") {
     std::size_t a = 10;
     std::size_t b = 20;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, 200u);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(200u == a);
 
     a = 0u;
     b = 20u;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, 0u);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(0u == a);
 
     a = 20u;
     b = 0u;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, 0u);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(0u == a);
 
     a = std::numeric_limits<std::size_t>::max();
     b = 1u;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<std::size_t>::max());
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<std::size_t>::max() == a);
 
     a = std::numeric_limits<std::size_t>::max();
     b = 2u;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<std::size_t>::max());
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<std::size_t>::max() == a);
 
     a = std::numeric_limits<std::size_t>::max();
     b = std::numeric_limits<std::size_t>::max();
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<std::size_t>::max());
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<std::size_t>::max() == a);
 
     a = std::numeric_limits<std::size_t>::max() / 100;
     b = 99u;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_EQ(a, std::numeric_limits<std::size_t>::max() / 100u * 99u);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<std::size_t>::max() / 100u * 99u == a);
 }
 
-TEST(CheckedMultiply, Float) {
+TEST_CASE("CheckedMultiply: Float", "[helpers]") {
     float a{10.0F};
     float b{20.0F};
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, 200);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(200 == Approx(a));
 
     a = 0.0F;
     b = 20.0F;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, 0);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(0 == Approx(a));
 
     a = INFINITY;
     b = 20.0F;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, INFINITY);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(INFINITY == Approx(a));
 
     a = 2.0F;
     b = -INFINITY;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, -INFINITY);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(-INFINITY == Approx(a));
 
     a = std::numeric_limits<float>::max() / 100.0F;
     b = 1.0F;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, std::numeric_limits<float>::max() / 100.0F);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<float>::max() / 100.0F == Approx(a));
 
     a = std::numeric_limits<float>::max() / 100.0F;
     b = 99.0F;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, std::numeric_limits<float>::max() / 100.0F * 99.0F);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<float>::max() / 100.0F * 99.0F == Approx(a));
 
     a = std::numeric_limits<float>::max() / 100.0F;
     b = 101;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, std::numeric_limits<float>::max() / 100.0F);
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<float>::max() / 100.0F == Approx(a));
 
     a = std::numeric_limits<float>::max() / 100.0F;
     b = -99;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, std::numeric_limits<float>::max() / 100.0F * -99.0F);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<float>::max() / 100.0F * -99.0F == Approx(a));
 
     a = std::numeric_limits<float>::max() / 100.0F;
     b = -101;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_FLOAT_EQ(a, std::numeric_limits<float>::max() / 100.0F);
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<float>::max() / 100.0F == Approx(a));
 }
 
-TEST(CheckedMultiply, Double) {
+TEST_CASE("CheckedMultiply: Double", "[helpers]") {
     double a{10.0F};
     double b{20.0F};
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, 200);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(200 == Approx(a));
 
     a = 0;
     b = 20;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, 0);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(0 == Approx(a));
 
     a = INFINITY;
     b = 20;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, INFINITY);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(INFINITY == Approx(a));
 
     a = 2;
     b = -INFINITY;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, -INFINITY);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(-INFINITY == Approx(a));
 
     a = std::numeric_limits<double>::max() / 100;
     b = 1;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, std::numeric_limits<double>::max() / 100);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<double>::max() / 100 == Approx(a));
 
     a = std::numeric_limits<double>::max() / 100;
     b = 99;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, std::numeric_limits<double>::max() / 100 * 99);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<double>::max() / 100 * 99 == Approx(a));
 
     a = std::numeric_limits<double>::max() / 100;
     b = 101;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, std::numeric_limits<double>::max() / 100);
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<double>::max() / 100 == Approx(a));
 
     a = std::numeric_limits<double>::max() / 100;
     b = -99;
-    ASSERT_TRUE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, std::numeric_limits<double>::max() / 100 * -99);
+    REQUIRE(CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<double>::max() / 100 * -99 == Approx(a));
 
     a = std::numeric_limits<double>::max() / 100;
     b = -101;
-    ASSERT_FALSE(CLI::detail::checked_multiply(a, b));
-    ASSERT_DOUBLE_EQ(a, std::numeric_limits<double>::max() / 100);
+    REQUIRE(!CLI::detail::checked_multiply(a, b));
+    REQUIRE(std::numeric_limits<double>::max() / 100 == Approx(a));
 }
 
 // Yes, this is testing an app_helper :)
-TEST(AppHelper, TempfileCreated) {
+TEST_CASE("AppHelper: TempfileCreated", "[helpers]") {
     std::string name = "TestFileNotUsed.txt";
     {
         TempFile myfile{name};
 
-        EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+        CHECK_FALSE(CLI::ExistingFile(myfile).empty());
 
         bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-        EXPECT_TRUE(ok);
-        EXPECT_TRUE(CLI::ExistingFile(name).empty());
-        EXPECT_THROW({ TempFile otherfile(name); }, std::runtime_error);
+        CHECK(ok);
+        CHECK(CLI::ExistingFile(name).empty());
+        CHECK_THROWS_AS([&]() { TempFile otherfile(name); }(), std::runtime_error);
     }
-    EXPECT_FALSE(CLI::ExistingFile(name).empty());
+    CHECK_FALSE(CLI::ExistingFile(name).empty());
 }
 
-TEST(AppHelper, TempfileNotCreated) {
+TEST_CASE("AppHelper: TempfileNotCreated", "[helpers]") {
     std::string name = "TestFileNotUsed.txt";
     {
         TempFile myfile{name};
 
-        EXPECT_FALSE(CLI::ExistingFile(myfile).empty());
+        CHECK_FALSE(CLI::ExistingFile(myfile).empty());
     }
-    EXPECT_FALSE(CLI::ExistingFile(name).empty());
+    CHECK_FALSE(CLI::ExistingFile(name).empty());
 }
 
-TEST(AppHelper, Ofstream) {
+TEST_CASE("AppHelper: Ofstream", "[helpers]") {
 
     std::string name = "TestFileNotUsed.txt";
     {
@@ -752,174 +752,178 @@ TEST(AppHelper, Ofstream) {
             out << "this is output" << std::endl;
         }
 
-        EXPECT_TRUE(CLI::ExistingFile(myfile).empty());
+        CHECK(CLI::ExistingFile(myfile).empty());
     }
-    EXPECT_FALSE(CLI::ExistingFile(name).empty());
+    CHECK_FALSE(CLI::ExistingFile(name).empty());
 }
 
-TEST(Split, StringList) {
+TEST_CASE("Split: StringList", "[helpers]") {
 
     std::vector<std::string> results{"a", "long", "--lone", "-q"};
-    EXPECT_EQ(results, CLI::detail::split_names("a,long,--lone,-q"));
-    EXPECT_EQ(results, CLI::detail::split_names(" a, long, --lone, -q"));
-    EXPECT_EQ(results, CLI::detail::split_names(" a , long , --lone , -q "));
-    EXPECT_EQ(results, CLI::detail::split_names("   a  ,  long  ,  --lone  ,    -q  "));
+    CHECK(CLI::detail::split_names("a,long,--lone,-q") == results);
+    CHECK(CLI::detail::split_names(" a, long, --lone, -q") == results);
+    CHECK(CLI::detail::split_names(" a , long , --lone , -q ") == results);
+    CHECK(CLI::detail::split_names("   a  ,  long  ,  --lone  ,    -q  ") == results);
 
-    EXPECT_EQ(std::vector<std::string>({"one"}), CLI::detail::split_names("one"));
+    CHECK(CLI::detail::split_names("one") == std::vector<std::string>({"one"}));
 }
 
-TEST(RegEx, Shorts) {
+TEST_CASE("RegEx: Shorts", "[helpers]") {
     std::string name, value;
 
-    EXPECT_TRUE(CLI::detail::split_short("-a", name, value));
-    EXPECT_EQ("a", name);
-    EXPECT_EQ("", value);
+    CHECK(CLI::detail::split_short("-a", name, value));
+    CHECK(name == "a");
+    CHECK(value == "");
 
-    EXPECT_TRUE(CLI::detail::split_short("-B", name, value));
-    EXPECT_EQ("B", name);
-    EXPECT_EQ("", value);
+    CHECK(CLI::detail::split_short("-B", name, value));
+    CHECK(name == "B");
+    CHECK(value == "");
 
-    EXPECT_TRUE(CLI::detail::split_short("-cc", name, value));
-    EXPECT_EQ("c", name);
-    EXPECT_EQ("c", value);
+    CHECK(CLI::detail::split_short("-cc", name, value));
+    CHECK(name == "c");
+    CHECK(value == "c");
 
-    EXPECT_TRUE(CLI::detail::split_short("-simple", name, value));
-    EXPECT_EQ("s", name);
-    EXPECT_EQ("imple", value);
+    CHECK(CLI::detail::split_short("-simple", name, value));
+    CHECK(name == "s");
+    CHECK(value == "imple");
 
-    EXPECT_FALSE(CLI::detail::split_short("--a", name, value));
-    EXPECT_FALSE(CLI::detail::split_short("--thing", name, value));
-    EXPECT_FALSE(CLI::detail::split_short("--", name, value));
-    EXPECT_FALSE(CLI::detail::split_short("something", name, value));
-    EXPECT_FALSE(CLI::detail::split_short("s", name, value));
+    CHECK_FALSE(CLI::detail::split_short("--a", name, value));
+    CHECK_FALSE(CLI::detail::split_short("--thing", name, value));
+    CHECK_FALSE(CLI::detail::split_short("--", name, value));
+    CHECK_FALSE(CLI::detail::split_short("something", name, value));
+    CHECK_FALSE(CLI::detail::split_short("s", name, value));
 }
 
-TEST(RegEx, Longs) {
+TEST_CASE("RegEx: Longs", "[helpers]") {
     std::string name, value;
 
-    EXPECT_TRUE(CLI::detail::split_long("--a", name, value));
-    EXPECT_EQ("a", name);
-    EXPECT_EQ("", value);
+    CHECK(CLI::detail::split_long("--a", name, value));
+    CHECK(name == "a");
+    CHECK(value == "");
 
-    EXPECT_TRUE(CLI::detail::split_long("--thing", name, value));
-    EXPECT_EQ("thing", name);
-    EXPECT_EQ("", value);
+    CHECK(CLI::detail::split_long("--thing", name, value));
+    CHECK(name == "thing");
+    CHECK(value == "");
 
-    EXPECT_TRUE(CLI::detail::split_long("--some=thing", name, value));
-    EXPECT_EQ("some", name);
-    EXPECT_EQ("thing", value);
+    CHECK(CLI::detail::split_long("--some=thing", name, value));
+    CHECK(name == "some");
+    CHECK(value == "thing");
 
-    EXPECT_FALSE(CLI::detail::split_long("-a", name, value));
-    EXPECT_FALSE(CLI::detail::split_long("-things", name, value));
-    EXPECT_FALSE(CLI::detail::split_long("Q", name, value));
-    EXPECT_FALSE(CLI::detail::split_long("--", name, value));
+    CHECK_FALSE(CLI::detail::split_long("-a", name, value));
+    CHECK_FALSE(CLI::detail::split_long("-things", name, value));
+    CHECK_FALSE(CLI::detail::split_long("Q", name, value));
+    CHECK_FALSE(CLI::detail::split_long("--", name, value));
 }
 
-TEST(RegEx, SplittingNew) {
+TEST_CASE("RegEx: SplittingNew", "[helpers]") {
 
     std::vector<std::string> shorts;
     std::vector<std::string> longs;
     std::string pname;
 
-    EXPECT_NO_THROW(std::tie(shorts, longs, pname) = CLI::detail::get_names({"--long", "-s", "-q", "--also-long"}));
-    EXPECT_EQ(std::vector<std::string>({"long", "also-long"}), longs);
-    EXPECT_EQ(std::vector<std::string>({"s", "q"}), shorts);
-    EXPECT_EQ("", pname);
+    CHECK_NOTHROW(std::tie(shorts, longs, pname) = CLI::detail::get_names({"--long", "-s", "-q", "--also-long"}));
+    CHECK(longs == std::vector<std::string>({"long", "also-long"}));
+    CHECK(shorts == std::vector<std::string>({"s", "q"}));
+    CHECK(pname == "");
 
-    EXPECT_NO_THROW(std::tie(shorts, longs, pname) =
-                        CLI::detail::get_names({"--long", "", "-s", "-q", "", "--also-long"}));
-    EXPECT_EQ(std::vector<std::string>({"long", "also-long"}), longs);
-    EXPECT_EQ(std::vector<std::string>({"s", "q"}), shorts);
+    std::tie(shorts, longs, pname) = CLI::detail::get_names({"--long", "", "-s", "-q", "", "--also-long"});
+    CHECK(longs == std::vector<std::string>({"long", "also-long"}));
+    CHECK(shorts == std::vector<std::string>({"s", "q"}));
 
-    EXPECT_THROW(std::tie(shorts, longs, pname) = CLI::detail::get_names({"-"}), CLI::BadNameString);
-    EXPECT_THROW(std::tie(shorts, longs, pname) = CLI::detail::get_names({"--"}), CLI::BadNameString);
-    EXPECT_THROW(std::tie(shorts, longs, pname) = CLI::detail::get_names({"-hi"}), CLI::BadNameString);
-    EXPECT_THROW(std::tie(shorts, longs, pname) = CLI::detail::get_names({"---hi"}), CLI::BadNameString);
-    EXPECT_THROW(std::tie(shorts, longs, pname) = CLI::detail::get_names({"one", "two"}), CLI::BadNameString);
+    CHECK_THROWS_AS([&]() { std::tie(shorts, longs, pname) = CLI::detail::get_names({"-"}); }(), CLI::BadNameString);
+    CHECK_THROWS_AS([&]() { std::tie(shorts, longs, pname) = CLI::detail::get_names({"--"}); }(), CLI::BadNameString);
+    CHECK_THROWS_AS([&]() { std::tie(shorts, longs, pname) = CLI::detail::get_names({"-hi"}); }(), CLI::BadNameString);
+    CHECK_THROWS_AS([&]() { std::tie(shorts, longs, pname) = CLI::detail::get_names({"---hi"}); }(),
+                    CLI::BadNameString);
+    CHECK_THROWS_AS(
+        [&]() {
+            std::tie(shorts, longs, pname) = CLI::detail::get_names({"one", "two"});
+        }(),
+        CLI::BadNameString);
 }
 
-TEST(String, ToLower) { EXPECT_EQ("one and two", CLI::detail::to_lower("one And TWO")); }
+TEST_CASE("String: ToLower", "[helpers]") { CHECK("one and two" == CLI::detail::to_lower("one And TWO")); }
 
-TEST(Join, Forward) {
+TEST_CASE("Join: Forward", "[helpers]") {
     std::vector<std::string> val{{"one", "two", "three"}};
-    EXPECT_EQ("one,two,three", CLI::detail::join(val));
-    EXPECT_EQ("one;two;three", CLI::detail::join(val, ";"));
+    CHECK(CLI::detail::join(val) == "one,two,three");
+    CHECK(CLI::detail::join(val, ";") == "one;two;three");
 }
 
-TEST(Join, Backward) {
+TEST_CASE("Join: Backward", "[helpers]") {
     std::vector<std::string> val{{"three", "two", "one"}};
-    EXPECT_EQ("one,two,three", CLI::detail::rjoin(val));
-    EXPECT_EQ("one;two;three", CLI::detail::rjoin(val, ";"));
+    CHECK(CLI::detail::rjoin(val) == "one,two,three");
+    CHECK(CLI::detail::rjoin(val, ";") == "one;two;three");
 }
 
-TEST(SplitUp, Simple) {
+TEST_CASE("SplitUp: Simple", "[helpers]") {
     std::vector<std::string> oput = {"one", "two three"};
     std::string orig{R"(one "two three")"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
-    EXPECT_EQ(oput, result);
+    CHECK(result == oput);
 }
 
-TEST(SplitUp, SimpleDifferentQuotes) {
+TEST_CASE("SplitUp: SimpleDifferentQuotes", "[helpers]") {
     std::vector<std::string> oput = {"one", "two three"};
     std::string orig{R"(one `two three`)"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
-    EXPECT_EQ(oput, result);
+    CHECK(result == oput);
 }
 
-TEST(SplitUp, SimpleDifferentQuotes2) {
+TEST_CASE("SplitUp: SimpleDifferentQuotes2", "[helpers]") {
     std::vector<std::string> oput = {"one", "two three"};
     std::string orig{R"(one 'two three')"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
-    EXPECT_EQ(oput, result);
+    CHECK(result == oput);
 }
 
-TEST(SplitUp, Layered) {
+TEST_CASE("SplitUp: Layered", "[helpers]") {
     std::vector<std::string> output = {R"(one 'two three')"};
     std::string orig{R"("one 'two three'")"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
-    EXPECT_EQ(output, result);
+    CHECK(result == output);
 }
 
-TEST(SplitUp, Spaces) {
+TEST_CASE("SplitUp: Spaces", "[helpers]") {
     std::vector<std::string> oput = {"one", "  two three"};
     std::string orig{R"(  one  "  two three" )"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
-    EXPECT_EQ(oput, result);
+    CHECK(result == oput);
 }
 
-TEST(SplitUp, BadStrings) {
+TEST_CASE("SplitUp: BadStrings", "[helpers]") {
     std::vector<std::string> oput = {"one", "  two three"};
     std::string orig{R"(  one  "  two three )"};
     std::vector<std::string> result = CLI::detail::split_up(orig);
-    EXPECT_EQ(oput, result);
+    CHECK(result == oput);
 
     oput = {"one", "  two three"};
     orig = R"(  one  '  two three )";
     result = CLI::detail::split_up(orig);
-    EXPECT_EQ(oput, result);
+    CHECK(result == oput);
 }
 
-TEST(Types, TypeName) {
+TEST_CASE("Types: TypeName", "[helpers]") {
     std::string int_name = CLI::detail::type_name<int>();
-    EXPECT_EQ("INT", int_name);
+    CHECK(int_name == "INT");
 
     std::string int2_name = CLI::detail::type_name<std::int16_t>();
-    EXPECT_EQ("INT", int2_name);
+    CHECK(int2_name == "INT");
 
     std::string uint_name = CLI::detail::type_name<unsigned char>();
-    EXPECT_EQ("UINT", uint_name);
+    CHECK(uint_name == "UINT");
 
     std::string float_name = CLI::detail::type_name<double>();
-    EXPECT_EQ("FLOAT", float_name);
+    CHECK(float_name == "FLOAT");
 
     std::string char_name = CLI::detail::type_name<char>();
-    EXPECT_EQ("CHAR", char_name);
+    CHECK(char_name == "CHAR");
 
     std::string vector_name = CLI::detail::type_name<std::vector<int>>();
-    EXPECT_EQ("INT", vector_name);
+    CHECK(vector_name == "INT");
 
     vector_name = CLI::detail::type_name<std::vector<double>>();
-    EXPECT_EQ("FLOAT", vector_name);
+    CHECK(vector_name == "FLOAT");
 
     static_assert(CLI::detail::classify_object<std::pair<int, std::string>>::value ==
                       CLI::detail::object_category::tuple_value,
@@ -930,228 +934,228 @@ TEST(Types, TypeName) {
                   "tuple<string,double> does not read like a tuple");
 
     std::string pair_name = CLI::detail::type_name<std::vector<std::pair<int, std::string>>>();
-    EXPECT_EQ("[INT,TEXT]", pair_name);
+    CHECK(pair_name == "[INT,TEXT]");
 
     vector_name = CLI::detail::type_name<std::vector<std::vector<unsigned char>>>();
-    EXPECT_EQ("UINT", vector_name);
+    CHECK(vector_name == "UINT");
 
     auto vclass = CLI::detail::classify_object<std::vector<std::vector<unsigned char>>>::value;
-    EXPECT_EQ(vclass, CLI::detail::object_category::container_value);
+    CHECK(CLI::detail::object_category::container_value == vclass);
 
     auto tclass = CLI::detail::classify_object<std::tuple<double>>::value;
-    EXPECT_EQ(tclass, CLI::detail::object_category::number_constructible);
+    CHECK(CLI::detail::object_category::number_constructible == tclass);
 
     std::string tuple_name = CLI::detail::type_name<std::tuple<double>>();
-    EXPECT_EQ("FLOAT", tuple_name);
+    CHECK(tuple_name == "FLOAT");
 
     static_assert(CLI::detail::classify_object<std::tuple<int, std::string>>::value ==
                       CLI::detail::object_category::tuple_value,
                   "tuple<int,string> does not read like a tuple");
     tuple_name = CLI::detail::type_name<std::tuple<int, std::string>>();
-    EXPECT_EQ("[INT,TEXT]", tuple_name);
+    CHECK(tuple_name == "[INT,TEXT]");
 
     tuple_name = CLI::detail::type_name<std::tuple<const int, std::string>>();
-    EXPECT_EQ("[INT,TEXT]", tuple_name);
+    CHECK(tuple_name == "[INT,TEXT]");
 
     tuple_name = CLI::detail::type_name<const std::tuple<int, std::string>>();
-    EXPECT_EQ("[INT,TEXT]", tuple_name);
+    CHECK(tuple_name == "[INT,TEXT]");
 
     tuple_name = CLI::detail::type_name<std::tuple<std::string, double>>();
-    EXPECT_EQ("[TEXT,FLOAT]", tuple_name);
+    CHECK(tuple_name == "[TEXT,FLOAT]");
 
     tuple_name = CLI::detail::type_name<const std::tuple<std::string, double>>();
-    EXPECT_EQ("[TEXT,FLOAT]", tuple_name);
+    CHECK(tuple_name == "[TEXT,FLOAT]");
 
     tuple_name = CLI::detail::type_name<std::tuple<int, std::string, double>>();
-    EXPECT_EQ("[INT,TEXT,FLOAT]", tuple_name);
+    CHECK(tuple_name == "[INT,TEXT,FLOAT]");
 
     tuple_name = CLI::detail::type_name<std::tuple<int, std::string, double, unsigned int>>();
-    EXPECT_EQ("[INT,TEXT,FLOAT,UINT]", tuple_name);
+    CHECK(tuple_name == "[INT,TEXT,FLOAT,UINT]");
 
     tuple_name = CLI::detail::type_name<std::tuple<int, std::string, double, unsigned int, std::string>>();
-    EXPECT_EQ("[INT,TEXT,FLOAT,UINT,TEXT]", tuple_name);
+    CHECK(tuple_name == "[INT,TEXT,FLOAT,UINT,TEXT]");
 
     tuple_name = CLI::detail::type_name<std::array<int, 10>>();
-    EXPECT_EQ("[INT,INT,INT,INT,INT,INT,INT,INT,INT,INT]", tuple_name);
+    CHECK(tuple_name == "[INT,INT,INT,INT,INT,INT,INT,INT,INT,INT]");
 
     std::string text_name = CLI::detail::type_name<std::string>();
-    EXPECT_EQ("TEXT", text_name);
+    CHECK(text_name == "TEXT");
 
     std::string text2_name = CLI::detail::type_name<char *>();
-    EXPECT_EQ("TEXT", text2_name);
+    CHECK(text2_name == "TEXT");
 
     enum class test { test1, test2, test3 };
     std::string enum_name = CLI::detail::type_name<test>();
-    EXPECT_EQ("ENUM", enum_name);
+    CHECK(enum_name == "ENUM");
 
     vclass = CLI::detail::classify_object<std::tuple<test>>::value;
-    EXPECT_EQ(vclass, CLI::detail::object_category::tuple_value);
+    CHECK(CLI::detail::object_category::tuple_value == vclass);
     static_assert(CLI::detail::classify_object<std::tuple<test>>::value == CLI::detail::object_category::tuple_value,
                   "tuple<test> does not classify as a tuple");
     std::string enum_name2 = CLI::detail::type_name<std::tuple<test>>();
-    EXPECT_EQ("ENUM", enum_name2);
+    CHECK(enum_name2 == "ENUM");
     std::string umapName = CLI::detail::type_name<std::unordered_map<int, std::tuple<std::string, double>>>();
-    EXPECT_EQ("[INT,[TEXT,FLOAT]]", umapName);
+    CHECK(umapName == "[INT,[TEXT,FLOAT]]");
 
     vclass = CLI::detail::classify_object<std::atomic<int>>::value;
 }
 
-TEST(Types, OverflowSmall) {
+TEST_CASE("Types: OverflowSmall", "[helpers]") {
     signed char x;
-    auto strmax = std::to_string(SCHAR_MAX + 1);
-    EXPECT_FALSE(CLI::detail::lexical_cast(strmax, x));
+    auto strmax = std::to_string(std::numeric_limits<signed char>::max() + 1);
+    CHECK_FALSE(CLI::detail::lexical_cast(strmax, x));
 
     unsigned char y;
-    strmax = std::to_string(UINT8_MAX + 1);
-    EXPECT_FALSE(CLI::detail::lexical_cast(strmax, y));
+    strmax = std::to_string(std::numeric_limits<unsigned char>::max() + 1);
+    CHECK_FALSE(CLI::detail::lexical_cast(strmax, y));
 }
 
-TEST(Types, LexicalCastInt) {
+TEST_CASE("Types: LexicalCastInt", "[helpers]") {
     std::string signed_input = "-912";
     int x_signed;
-    EXPECT_TRUE(CLI::detail::lexical_cast(signed_input, x_signed));
-    EXPECT_EQ(-912, x_signed);
+    CHECK(CLI::detail::lexical_cast(signed_input, x_signed));
+    CHECK(x_signed == -912);
 
     std::string unsigned_input = "912";
     unsigned int x_unsigned;
-    EXPECT_TRUE(CLI::detail::lexical_cast(unsigned_input, x_unsigned));
-    EXPECT_EQ((unsigned int)912, x_unsigned);
+    CHECK(CLI::detail::lexical_cast(unsigned_input, x_unsigned));
+    CHECK(x_unsigned == (unsigned int)912);
 
-    EXPECT_FALSE(CLI::detail::lexical_cast(signed_input, x_unsigned));
+    CHECK_FALSE(CLI::detail::lexical_cast(signed_input, x_unsigned));
 
     unsigned char y;
-    std::string overflow_input = std::to_string(UINT64_MAX) + "0";
-    EXPECT_FALSE(CLI::detail::lexical_cast(overflow_input, y));
+    std::string overflow_input = std::to_string(std::numeric_limits<uint64_t>::max()) + "0";
+    CHECK_FALSE(CLI::detail::lexical_cast(overflow_input, y));
 
     char y_signed;
-    EXPECT_FALSE(CLI::detail::lexical_cast(overflow_input, y_signed));
+    CHECK_FALSE(CLI::detail::lexical_cast(overflow_input, y_signed));
 
     std::string bad_input = "hello";
-    EXPECT_FALSE(CLI::detail::lexical_cast(bad_input, y));
+    CHECK_FALSE(CLI::detail::lexical_cast(bad_input, y));
 
     std::string extra_input = "912i";
-    EXPECT_FALSE(CLI::detail::lexical_cast(extra_input, y));
+    CHECK_FALSE(CLI::detail::lexical_cast(extra_input, y));
 
     std::string empty_input{};
-    EXPECT_FALSE(CLI::detail::lexical_cast(empty_input, x_signed));
-    EXPECT_FALSE(CLI::detail::lexical_cast(empty_input, x_unsigned));
-    EXPECT_FALSE(CLI::detail::lexical_cast(empty_input, y_signed));
+    CHECK_FALSE(CLI::detail::lexical_cast(empty_input, x_signed));
+    CHECK_FALSE(CLI::detail::lexical_cast(empty_input, x_unsigned));
+    CHECK_FALSE(CLI::detail::lexical_cast(empty_input, y_signed));
 }
 
-TEST(Types, LexicalCastDouble) {
+TEST_CASE("Types: LexicalCastDouble", "[helpers]") {
     std::string input = "9.12";
     long double x;
-    EXPECT_TRUE(CLI::detail::lexical_cast(input, x));
-    EXPECT_FLOAT_EQ((float)9.12, (float)x);
+    CHECK(CLI::detail::lexical_cast(input, x));
+    CHECK((float)x == Approx((float)9.12));
 
     std::string bad_input = "hello";
-    EXPECT_FALSE(CLI::detail::lexical_cast(bad_input, x));
+    CHECK_FALSE(CLI::detail::lexical_cast(bad_input, x));
 
-    std::string overflow_input = "1" + std::to_string(LDBL_MAX);
-    EXPECT_TRUE(CLI::detail::lexical_cast(overflow_input, x));
-    EXPECT_FALSE(std::isfinite(x));
+    std::string overflow_input = "1" + std::to_string(std::numeric_limits<long double>::max());
+    CHECK(CLI::detail::lexical_cast(overflow_input, x));
+    CHECK_FALSE(std::isfinite(x));
 
     std::string extra_input = "9.12i";
-    EXPECT_FALSE(CLI::detail::lexical_cast(extra_input, x));
+    CHECK_FALSE(CLI::detail::lexical_cast(extra_input, x));
 
     std::string empty_input{};
-    EXPECT_FALSE(CLI::detail::lexical_cast(empty_input, x));
+    CHECK_FALSE(CLI::detail::lexical_cast(empty_input, x));
 }
 
-TEST(Types, LexicalCastBool) {
+TEST_CASE("Types: LexicalCastBool", "[helpers]") {
     std::string input = "false";
     bool x;
-    EXPECT_TRUE(CLI::detail::lexical_cast(input, x));
-    EXPECT_FALSE(x);
+    CHECK(CLI::detail::lexical_cast(input, x));
+    CHECK_FALSE(x);
 
     std::string bad_input = "happy";
-    EXPECT_FALSE(CLI::detail::lexical_cast(bad_input, x));
+    CHECK_FALSE(CLI::detail::lexical_cast(bad_input, x));
 
     std::string input_true = "EnaBLE";
-    EXPECT_TRUE(CLI::detail::lexical_cast(input_true, x));
-    EXPECT_TRUE(x);
+    CHECK(CLI::detail::lexical_cast(input_true, x));
+    CHECK(x);
 }
 
-TEST(Types, LexicalCastString) {
+TEST_CASE("Types: LexicalCastString", "[helpers]") {
     std::string input = "one";
     std::string output;
     CLI::detail::lexical_cast(input, output);
-    EXPECT_EQ(input, output);
+    CHECK(output == input);
 }
 
-TEST(Types, LexicalCastParsable) {
+TEST_CASE("Types: LexicalCastParsable", "[helpers]") {
     std::string input = "(4.2,7.3)";
     std::string fail_input = "4.2,7.3";
     std::string extra_input = "(4.2,7.3)e";
 
     std::complex<double> output;
-    EXPECT_TRUE(CLI::detail::lexical_cast(input, output));
-    EXPECT_DOUBLE_EQ(output.real(), 4.2);  // Doing this in one go sometimes has trouble
-    EXPECT_DOUBLE_EQ(output.imag(), 7.3);  // on clang + gcc 4.8 due to missing const
+    CHECK(CLI::detail::lexical_cast(input, output));
+    CHECK(4.2 == Approx(output.real()));
+    CHECK(7.3 == Approx(output.imag()));
 
-    EXPECT_TRUE(CLI::detail::lexical_cast("2.456", output));
-    EXPECT_DOUBLE_EQ(output.real(), 2.456);  // Doing this in one go sometimes has trouble
-    EXPECT_DOUBLE_EQ(output.imag(), 0.0);    // on clang + gcc 4.8 due to missing const
+    CHECK(CLI::detail::lexical_cast("2.456", output));
+    CHECK(2.456 == Approx(output.real()));
+    CHECK(0.0 == Approx(output.imag()));
 
-    EXPECT_FALSE(CLI::detail::lexical_cast(fail_input, output));
-    EXPECT_FALSE(CLI::detail::lexical_cast(extra_input, output));
+    CHECK_FALSE(CLI::detail::lexical_cast(fail_input, output));
+    CHECK_FALSE(CLI::detail::lexical_cast(extra_input, output));
 }
 
-TEST(Types, LexicalCastEnum) {
+TEST_CASE("Types: LexicalCastEnum", "[helpers]") {
     enum t1 : signed char { v1 = 5, v3 = 7, v5 = -9 };
 
     t1 output;
-    EXPECT_TRUE(CLI::detail::lexical_cast("-9", output));
-    EXPECT_EQ(output, v5);
+    CHECK(CLI::detail::lexical_cast("-9", output));
+    CHECK(v5 == output);
 
-    EXPECT_FALSE(CLI::detail::lexical_cast("invalid", output));
+    CHECK_FALSE(CLI::detail::lexical_cast("invalid", output));
     enum class t2 : std::uint64_t { enum1 = 65, enum2 = 45667, enum3 = 9999999999999 };
     t2 output2{t2::enum2};
-    EXPECT_TRUE(CLI::detail::lexical_cast("65", output2));
-    EXPECT_EQ(output2, t2::enum1);
+    CHECK(CLI::detail::lexical_cast("65", output2));
+    CHECK(t2::enum1 == output2);
 
-    EXPECT_FALSE(CLI::detail::lexical_cast("invalid", output2));
+    CHECK_FALSE(CLI::detail::lexical_cast("invalid", output2));
 
-    EXPECT_TRUE(CLI::detail::lexical_cast("9999999999999", output2));
-    EXPECT_EQ(output2, t2::enum3);
+    CHECK(CLI::detail::lexical_cast("9999999999999", output2));
+    CHECK(t2::enum3 == output2);
 }
 
-TEST(Types, LexicalConversionDouble) {
+TEST_CASE("Types: LexicalConversionDouble", "[helpers]") {
     CLI::results_t input = {"9.12"};
     long double x{0.0};
     bool res = CLI::detail::lexical_conversion<long double, double>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_FLOAT_EQ((float)9.12, (float)x);
+    CHECK(res);
+    CHECK((float)x == Approx((float)9.12));
 
     CLI::results_t bad_input = {"hello"};
     res = CLI::detail::lexical_conversion<long double, double>(input, x);
-    EXPECT_TRUE(res);
+    CHECK(res);
 }
 
-TEST(Types, LexicalConversionDoubleTuple) {
+TEST_CASE("Types: LexicalConversionDoubleTuple", "[helpers]") {
     CLI::results_t input = {"9.12"};
     std::tuple<double> x{0.0};
     bool res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_DOUBLE_EQ(9.12, std::get<0>(x));
+    CHECK(res);
+    CHECK(std::get<0>(x) == Approx(9.12));
 
     CLI::results_t bad_input = {"hello"};
     res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_TRUE(res);
+    CHECK(res);
 }
 
-TEST(Types, LexicalConversionVectorDouble) {
+TEST_CASE("Types: LexicalConversionVectorDouble", "[helpers]") {
     CLI::results_t input = {"9.12", "10.79", "-3.54"};
     std::vector<double> x;
     bool res = CLI::detail::lexical_conversion<std::vector<double>, double>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_EQ(x.size(), 3u);
-    EXPECT_DOUBLE_EQ(x[2], -3.54);
+    CHECK(res);
+    CHECK(3u == x.size());
+    CHECK(-3.54 == Approx(x[2]));
 
     res = CLI::detail::lexical_conversion<std::vector<double>, std::vector<double>>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_EQ(x.size(), 3u);
-    EXPECT_DOUBLE_EQ(x[2], -3.54);
+    CHECK(res);
+    CHECK(3u == x.size());
+    CHECK(-3.54 == Approx(x[2]));
 }
 
 static_assert(!CLI::detail::is_tuple_like<std::vector<double>>::value, "vector should not be like a tuple");
@@ -1162,107 +1166,107 @@ static_assert(!CLI::detail::is_tuple_like<std::string>::value, "std::string shou
 static_assert(!CLI::detail::is_tuple_like<double>::value, "double should not be like a tuple");
 static_assert(CLI::detail::is_tuple_like<std::tuple<double, int, double>>::value, "tuple should look like a tuple");
 
-TEST(Types, LexicalConversionTuple2) {
+TEST_CASE("Types: LexicalConversionTuple2", "[helpers]") {
     CLI::results_t input = {"9.12", "19"};
 
     std::tuple<double, int> x{0.0, 0};
     static_assert(CLI::detail::is_tuple_like<decltype(x)>::value,
                   "tuple type must have is_tuple_like trait to be true");
     bool res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_EQ(std::get<1>(x), 19);
-    EXPECT_DOUBLE_EQ(std::get<0>(x), 9.12);
+    CHECK(res);
+    CHECK(19 == std::get<1>(x));
+    CHECK(9.12 == Approx(std::get<0>(x)));
 
     input = {"19", "9.12"};
     res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_FALSE(res);
+    CHECK_FALSE(res);
 }
 
-TEST(Types, LexicalConversionTuple3) {
+TEST_CASE("Types: LexicalConversionTuple3", "[helpers]") {
     CLI::results_t input = {"9.12", "19", "hippo"};
     std::tuple<double, int, std::string> x;
     bool res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_EQ(std::get<1>(x), 19);
-    EXPECT_DOUBLE_EQ(std::get<0>(x), 9.12);
-    EXPECT_EQ(std::get<2>(x), "hippo");
+    CHECK(res);
+    CHECK(19 == std::get<1>(x));
+    CHECK(9.12 == Approx(std::get<0>(x)));
+    CHECK("hippo" == std::get<2>(x));
 
     input = {"19", "9.12"};
     res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_FALSE(res);
+    CHECK_FALSE(res);
 }
 
-TEST(Types, LexicalConversionTuple4) {
+TEST_CASE("Types: LexicalConversionTuple4", "[helpers]") {
     CLI::results_t input = {"9.12", "19", "18.6", "5.87"};
     std::array<double, 4> x;
     bool res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_DOUBLE_EQ(std::get<1>(x), 19);
-    EXPECT_DOUBLE_EQ(x[0], 9.12);
-    EXPECT_DOUBLE_EQ(x[2], 18.6);
-    EXPECT_DOUBLE_EQ(x[3], 5.87);
+    CHECK(res);
+    CHECK(19 == Approx(std::get<1>(x)));
+    CHECK(9.12 == Approx(x[0]));
+    CHECK(18.6 == Approx(x[2]));
+    CHECK(5.87 == Approx(x[3]));
 
     input = {"19", "9.12", "hippo"};
     res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_FALSE(res);
+    CHECK_FALSE(res);
 }
 
-TEST(Types, LexicalConversionTuple5) {
+TEST_CASE("Types: LexicalConversionTuple5", "[helpers]") {
     CLI::results_t input = {"9", "19", "18", "5", "235235"};
     std::array<unsigned int, 5> x;
     bool res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_EQ(std::get<1>(x), 19u);
-    EXPECT_EQ(x[0], 9u);
-    EXPECT_EQ(x[2], 18u);
-    EXPECT_EQ(x[3], 5u);
-    EXPECT_EQ(x[4], 235235u);
+    CHECK(res);
+    CHECK(19u == std::get<1>(x));
+    CHECK(9u == x[0]);
+    CHECK(18u == x[2]);
+    CHECK(5u == x[3]);
+    CHECK(235235u == x[4]);
 
     input = {"19", "9.12", "hippo"};
     res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_FALSE(res);
+    CHECK_FALSE(res);
 }
 
-TEST(Types, LexicalConversionTuple10) {
+TEST_CASE("Types: LexicalConversionTuple10", "[helpers]") {
     CLI::results_t input = {"9", "19", "18", "5", "235235", "9", "19", "18", "5", "235235"};
     std::array<unsigned int, 10> x;
     bool res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_EQ(std::get<1>(x), 19u);
-    EXPECT_EQ(x[0], 9u);
-    EXPECT_EQ(x[2], 18u);
-    EXPECT_EQ(x[3], 5u);
-    EXPECT_EQ(x[4], 235235u);
-    EXPECT_EQ(x[9], 235235u);
+    CHECK(res);
+    CHECK(19u == std::get<1>(x));
+    CHECK(9u == x[0]);
+    CHECK(18u == x[2]);
+    CHECK(5u == x[3]);
+    CHECK(235235u == x[4]);
+    CHECK(235235u == x[9]);
     input[3] = "hippo";
     res = CLI::detail::lexical_conversion<decltype(x), decltype(x)>(input, x);
-    EXPECT_FALSE(res);
+    CHECK_FALSE(res);
 }
 
-TEST(Types, LexicalConversionTuple10XC) {
+TEST_CASE("Types: LexicalConversionTuple10XC", "[helpers]") {
     CLI::results_t input = {"9", "19", "18", "5", "235235", "9", "19", "18", "5", "235235"};
     std::array<double, 10> x;
     bool res = CLI::detail::lexical_conversion<decltype(x), std::array<unsigned int, 10>>(input, x);
 
-    EXPECT_TRUE(res);
-    EXPECT_EQ(std::get<1>(x), 19.0);
-    EXPECT_EQ(x[0], 9.0);
-    EXPECT_EQ(x[2], 18.0);
-    EXPECT_EQ(x[3], 5.0);
-    EXPECT_EQ(x[4], 235235.0);
-    EXPECT_EQ(x[9], 235235.0);
+    CHECK(res);
+    CHECK(19.0 == std::get<1>(x));
+    CHECK(9.0 == x[0]);
+    CHECK(18.0 == x[2]);
+    CHECK(5.0 == x[3]);
+    CHECK(235235.0 == x[4]);
+    CHECK(235235.0 == x[9]);
     input[3] = "19.7";
     res = CLI::detail::lexical_conversion<decltype(x), std::array<unsigned int, 10>>(input, x);
-    EXPECT_FALSE(res);
+    CHECK_FALSE(res);
 }
 
-TEST(Types, LexicalConversionComplex) {
+TEST_CASE("Types: LexicalConversionComplex", "[helpers]") {
     CLI::results_t input = {"5.1", "3.5"};
     std::complex<double> x;
     bool res = CLI::detail::lexical_conversion<std::complex<double>, std::array<double, 2>>(input, x);
-    EXPECT_TRUE(res);
-    EXPECT_EQ(x.real(), 5.1);
-    EXPECT_EQ(x.imag(), 3.5);
+    CHECK(res);
+    CHECK(5.1 == x.real());
+    CHECK(3.5 == x.imag());
 }
 
 static_assert(CLI::detail::is_wrapper<std::vector<double>>::value, "vector double should be a wrapper");
@@ -1285,16 +1289,16 @@ static_assert(CLI::detail::is_readable_container<const std::vector<int>>::value,
 static_assert(CLI::detail::is_readable_container<const std::vector<int> &>::value,
               "const vector int & should be a readable container");
 
-TEST(FixNewLines, BasicCheck) {
+TEST_CASE("FixNewLines: BasicCheck", "[helpers]") {
     std::string input = "one\ntwo";
     std::string output = "one\n; two";
     std::string result = CLI::detail::fix_newlines("; ", input);
-    EXPECT_EQ(result, output);
+    CHECK(output == result);
 }
 
-TEST(FixNewLines, EdgesCheck) {
+TEST_CASE("FixNewLines: EdgesCheck", "[helpers]") {
     std::string input = "\none\ntwo\n";
     std::string output = "\n; one\n; two\n; ";
     std::string result = CLI::detail::fix_newlines("; ", input);
-    EXPECT_EQ(result, output);
+    CHECK(output == result);
 }
diff --git a/packages/CLI11/tests/NewParseTest.cpp b/packages/CLI11/tests/NewParseTest.cpp
index cf0adc384726c3b51fbdcfb0244ff105914701c0..d9d9dbf91ca777d8c70fa2f289316e535acec0f0 100644
--- a/packages/CLI11/tests/NewParseTest.cpp
+++ b/packages/CLI11/tests/NewParseTest.cpp
@@ -5,159 +5,159 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "app_helper.hpp"
-#include "gmock/gmock.h"
+
 #include <complex>
 #include <cstdint>
 
-using ::testing::HasSubstr;
+using Catch::Matchers::Contains;
 
 using cx = std::complex<double>;
 
-TEST_F(TApp, Complex) {
+TEST_CASE_METHOD(TApp, "Complex", "[newparse]") {
     cx comp{1, 2};
     app.add_complex("-c,--complex", comp, "", true);
 
     args = {"-c", "4", "3"};
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("2"));
-    EXPECT_THAT(help, HasSubstr("COMPLEX"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("2"));
+    CHECK_THAT(help, Contains("COMPLEX"));
 
-    EXPECT_DOUBLE_EQ(1, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(1));
+    CHECK(comp.imag() == Approx(2));
 
     run();
 
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 }
 
-TEST_F(TApp, ComplexOption) {
+TEST_CASE_METHOD(TApp, "ComplexOption", "[newparse]") {
     cx comp{1, 2};
     app.add_option("-c,--complex", comp, "", true);
 
     args = {"-c", "4", "3"};
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("2"));
-    EXPECT_THAT(help, HasSubstr("COMPLEX"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("2"));
+    CHECK_THAT(help, Contains("COMPLEX"));
 
-    EXPECT_DOUBLE_EQ(1, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(1));
+    CHECK(comp.imag() == Approx(2));
 
     run();
 
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 }
 
-TEST_F(TApp, ComplexFloat) {
+TEST_CASE_METHOD(TApp, "ComplexFloat", "[newparse]") {
     std::complex<float> comp{1, 2};
     app.add_complex<std::complex<float>, float>("-c,--complex", comp, "", true);
 
     args = {"-c", "4", "3"};
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("2"));
-    EXPECT_THAT(help, HasSubstr("COMPLEX"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("2"));
+    CHECK_THAT(help, Contains("COMPLEX"));
 
-    EXPECT_FLOAT_EQ(1, comp.real());
-    EXPECT_FLOAT_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(1));
+    CHECK(comp.imag() == Approx(2));
 
     run();
 
-    EXPECT_FLOAT_EQ(4, comp.real());
-    EXPECT_FLOAT_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 }
 
-TEST_F(TApp, ComplexFloatOption) {
+TEST_CASE_METHOD(TApp, "ComplexFloatOption", "[newparse]") {
     std::complex<float> comp{1, 2};
     app.add_option("-c,--complex", comp, "", true);
 
     args = {"-c", "4", "3"};
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("2"));
-    EXPECT_THAT(help, HasSubstr("COMPLEX"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("2"));
+    CHECK_THAT(help, Contains("COMPLEX"));
 
-    EXPECT_FLOAT_EQ(1, comp.real());
-    EXPECT_FLOAT_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(1));
+    CHECK(comp.imag() == Approx(2));
 
     run();
 
-    EXPECT_FLOAT_EQ(4, comp.real());
-    EXPECT_FLOAT_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 }
 
-TEST_F(TApp, ComplexWithDelimiter) {
+TEST_CASE_METHOD(TApp, "ComplexWithDelimiter", "[newparse]") {
     cx comp{1, 2};
     app.add_complex("-c,--complex", comp, "", true)->delimiter('+');
 
     args = {"-c", "4+3i"};
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("2"));
-    EXPECT_THAT(help, HasSubstr("COMPLEX"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("2"));
+    CHECK_THAT(help, Contains("COMPLEX"));
 
-    EXPECT_DOUBLE_EQ(1, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(1));
+    CHECK(comp.imag() == Approx(2));
 
     run();
 
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 
     args = {"-c", "5+-3i"};
     run();
 
-    EXPECT_DOUBLE_EQ(5, comp.real());
-    EXPECT_DOUBLE_EQ(-3, comp.imag());
+    CHECK(comp.real() == Approx(5));
+    CHECK(comp.imag() == Approx(-3));
 
     args = {"-c", "6", "-4i"};
     run();
 
-    EXPECT_DOUBLE_EQ(6, comp.real());
-    EXPECT_DOUBLE_EQ(-4, comp.imag());
+    CHECK(comp.real() == Approx(6));
+    CHECK(comp.imag() == Approx(-4));
 }
 
-TEST_F(TApp, ComplexWithDelimiterOption) {
+TEST_CASE_METHOD(TApp, "ComplexWithDelimiterOption", "[newparse]") {
     cx comp{1, 2};
     app.add_option("-c,--complex", comp, "", true)->delimiter('+');
 
     args = {"-c", "4+3i"};
 
     std::string help = app.help();
-    EXPECT_THAT(help, HasSubstr("1"));
-    EXPECT_THAT(help, HasSubstr("2"));
-    EXPECT_THAT(help, HasSubstr("COMPLEX"));
+    CHECK_THAT(help, Contains("1"));
+    CHECK_THAT(help, Contains("2"));
+    CHECK_THAT(help, Contains("COMPLEX"));
 
-    EXPECT_DOUBLE_EQ(1, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(1));
+    CHECK(comp.imag() == Approx(2));
 
     run();
 
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 
     args = {"-c", "5+-3i"};
     run();
 
-    EXPECT_DOUBLE_EQ(5, comp.real());
-    EXPECT_DOUBLE_EQ(-3, comp.imag());
+    CHECK(comp.real() == Approx(5));
+    CHECK(comp.imag() == Approx(-3));
 
     args = {"-c", "6", "-4i"};
     run();
 
-    EXPECT_DOUBLE_EQ(6, comp.real());
-    EXPECT_DOUBLE_EQ(-4, comp.imag());
+    CHECK(comp.real() == Approx(6));
+    CHECK(comp.imag() == Approx(-4));
 }
 
-TEST_F(TApp, ComplexIgnoreI) {
+TEST_CASE_METHOD(TApp, "ComplexIgnoreI", "[newparse]") {
     cx comp{1, 2};
     app.add_complex("-c,--complex", comp);
 
@@ -165,11 +165,11 @@ TEST_F(TApp, ComplexIgnoreI) {
 
     run();
 
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 }
 
-TEST_F(TApp, ComplexIgnoreIOption) {
+TEST_CASE_METHOD(TApp, "ComplexIgnoreIOption", "[newparse]") {
     cx comp{1, 2};
     app.add_option("-c,--complex", comp);
 
@@ -177,122 +177,122 @@ TEST_F(TApp, ComplexIgnoreIOption) {
 
     run();
 
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(3, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(3));
 }
 
-TEST_F(TApp, ComplexSingleArg) {
+TEST_CASE_METHOD(TApp, "ComplexSingleArg", "[newparse]") {
     cx comp{1, 2};
     app.add_complex("-c,--complex", comp);
 
     args = {"-c", "4"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(0, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(0));
 
     args = {"-c", "4-2i"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(-2, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(-2));
     args = {"-c", "4+2i"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(2));
 
     args = {"-c", "-4+2j"};
     run();
-    EXPECT_DOUBLE_EQ(-4, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(-4));
+    CHECK(comp.imag() == Approx(2));
 
     args = {"-c", "-4.2-2j"};
     run();
-    EXPECT_DOUBLE_EQ(-4.2, comp.real());
-    EXPECT_DOUBLE_EQ(-2, comp.imag());
+    CHECK(comp.real() == Approx(-4.2));
+    CHECK(comp.imag() == Approx(-2));
 
     args = {"-c", "-4.2-2.7i"};
     run();
-    EXPECT_DOUBLE_EQ(-4.2, comp.real());
-    EXPECT_DOUBLE_EQ(-2.7, comp.imag());
+    CHECK(comp.real() == Approx(-4.2));
+    CHECK(comp.imag() == Approx(-2.7));
 }
 
-TEST_F(TApp, ComplexSingleArgOption) {
+TEST_CASE_METHOD(TApp, "ComplexSingleArgOption", "[newparse]") {
     cx comp{1, 2};
     app.add_option("-c,--complex", comp);
 
     args = {"-c", "4"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(0, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(0));
 
     args = {"-c", "4-2i"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(-2, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(-2));
     args = {"-c", "4+2i"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(2));
 
     args = {"-c", "-4+2j"};
     run();
-    EXPECT_DOUBLE_EQ(-4, comp.real());
-    EXPECT_DOUBLE_EQ(2, comp.imag());
+    CHECK(comp.real() == Approx(-4));
+    CHECK(comp.imag() == Approx(2));
 
     args = {"-c", "-4.2-2j"};
     run();
-    EXPECT_DOUBLE_EQ(-4.2, comp.real());
-    EXPECT_DOUBLE_EQ(-2, comp.imag());
+    CHECK(comp.real() == Approx(-4.2));
+    CHECK(comp.imag() == Approx(-2));
 
     args = {"-c", "-4.2-2.7i"};
     run();
-    EXPECT_DOUBLE_EQ(-4.2, comp.real());
-    EXPECT_DOUBLE_EQ(-2.7, comp.imag());
+    CHECK(comp.real() == Approx(-4.2));
+    CHECK(comp.imag() == Approx(-2.7));
 }
 
-TEST_F(TApp, ComplexSingleImag) {
+TEST_CASE_METHOD(TApp, "ComplexSingleImag", "[newparse]") {
     cx comp{1, 2};
     app.add_complex("-c,--complex", comp);
 
     args = {"-c", "4j"};
     run();
-    EXPECT_DOUBLE_EQ(0, comp.real());
-    EXPECT_DOUBLE_EQ(4, comp.imag());
+    CHECK(comp.real() == Approx(0));
+    CHECK(comp.imag() == Approx(4));
 
     args = {"-c", "-4j"};
     run();
-    EXPECT_DOUBLE_EQ(0, comp.real());
-    EXPECT_DOUBLE_EQ(-4, comp.imag());
+    CHECK(comp.real() == Approx(0));
+    CHECK(comp.imag() == Approx(-4));
     args = {"-c", "-4"};
     run();
-    EXPECT_DOUBLE_EQ(-4, comp.real());
-    EXPECT_DOUBLE_EQ(0, comp.imag());
+    CHECK(comp.real() == Approx(-4));
+    CHECK(comp.imag() == Approx(0));
     args = {"-c", "+4"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(0, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(0));
 }
 
-TEST_F(TApp, ComplexSingleImagOption) {
+TEST_CASE_METHOD(TApp, "ComplexSingleImagOption", "[newparse]") {
     cx comp{1, 2};
     app.add_option("-c,--complex", comp);
 
     args = {"-c", "4j"};
     run();
-    EXPECT_DOUBLE_EQ(0, comp.real());
-    EXPECT_DOUBLE_EQ(4, comp.imag());
+    CHECK(comp.real() == Approx(0));
+    CHECK(comp.imag() == Approx(4));
 
     args = {"-c", "-4j"};
     run();
-    EXPECT_DOUBLE_EQ(0, comp.real());
-    EXPECT_DOUBLE_EQ(-4, comp.imag());
+    CHECK(comp.real() == Approx(0));
+    CHECK(comp.imag() == Approx(-4));
     args = {"-c", "-4"};
     run();
-    EXPECT_DOUBLE_EQ(-4, comp.real());
-    EXPECT_DOUBLE_EQ(0, comp.imag());
+    CHECK(comp.real() == Approx(-4));
+    CHECK(comp.imag() == Approx(0));
     args = {"-c", "+4"};
     run();
-    EXPECT_DOUBLE_EQ(4, comp.real());
-    EXPECT_DOUBLE_EQ(0, comp.imag());
+    CHECK(comp.real() == Approx(4));
+    CHECK(comp.imag() == Approx(0));
 }
 
 /// Simple class containing two strings useful for testing lexical cast and conversions
@@ -321,24 +321,24 @@ template <> bool lexical_cast<spair>(const std::string &input, spair &output) {
 }  // namespace detail
 }  // namespace CLI
 
-TEST_F(TApp, custom_string_converter) {
+TEST_CASE_METHOD(TApp, "custom_string_converter", "[newparse]") {
     spair val;
     app.add_option("-d,--dual_string", val);
 
     args = {"-d", "string1:string2"};
 
     run();
-    EXPECT_EQ(val.first, "string1");
-    EXPECT_EQ(val.second, "string2");
+    CHECK("string1" == val.first);
+    CHECK("string2" == val.second);
 }
 
-TEST_F(TApp, custom_string_converterFail) {
+TEST_CASE_METHOD(TApp, "custom_string_converterFail", "[newparse]") {
     spair val;
     app.add_option("-d,--dual_string", val);
 
     args = {"-d", "string2"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
 /// simple class to wrap another  with a very specific type constructor and assignment operators to test out some of the
@@ -369,14 +369,14 @@ static_assert(CLI::detail::is_direct_constructible<objWrapper<std::string>, std:
 
 static_assert(!std::is_assignable<objWrapper<std::string>, std::string>::value,
               "string wrapper is improperly assignable");
-TEST_F(TApp, stringWrapper) {
+TEST_CASE_METHOD(TApp, "stringWrapper", "[newparse]") {
     objWrapper<std::string> sWrapper;
     app.add_option("-v", sWrapper);
     args = {"-v", "string test"};
 
     run();
 
-    EXPECT_EQ(sWrapper.value(), "string test");
+    CHECK("string test" == sWrapper.value());
 }
 
 static_assert(CLI::detail::is_direct_constructible<objWrapper<double>, double>::value,
@@ -388,18 +388,18 @@ static_assert(!CLI::detail::is_direct_constructible<objWrapper<double>, int>::va
 static_assert(!CLI::detail::is_istreamable<objWrapper<double>>::value,
               "double wrapper is input streamable and it shouldn't be");
 
-TEST_F(TApp, doubleWrapper) {
+TEST_CASE_METHOD(TApp, "doubleWrapper", "[newparse]") {
     objWrapper<double> dWrapper;
     app.add_option("-v", dWrapper);
     args = {"-v", "2.36"};
 
     run();
 
-    EXPECT_EQ(dWrapper.value(), 2.36);
+    CHECK(2.36 == dWrapper.value());
 
     args = {"-v", "thing"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
 static_assert(CLI::detail::is_direct_constructible<objWrapper<int>, int>::value,
@@ -411,17 +411,17 @@ static_assert(!CLI::detail::is_direct_constructible<objWrapper<int>, double>::va
 static_assert(!CLI::detail::is_istreamable<objWrapper<int>>::value,
               "int wrapper is input streamable and it shouldn't be");
 
-TEST_F(TApp, intWrapper) {
+TEST_CASE_METHOD(TApp, "intWrapper", "[newparse]") {
     objWrapper<int> iWrapper;
     app.add_option("-v", iWrapper);
     args = {"-v", "45"};
 
     run();
 
-    EXPECT_EQ(iWrapper.value(), 45);
+    CHECK(45 == iWrapper.value());
     args = {"-v", "thing"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
 static_assert(!CLI::detail::is_direct_constructible<objWrapper<float>, int>::value,
@@ -432,17 +432,17 @@ static_assert(!CLI::detail::is_direct_constructible<objWrapper<float>, double>::
 static_assert(!CLI::detail::is_istreamable<objWrapper<float>>::value,
               "float wrapper is input streamable and it shouldn't be");
 
-TEST_F(TApp, floatWrapper) {
+TEST_CASE_METHOD(TApp, "floatWrapper", "[newparse]") {
     objWrapper<float> iWrapper;
     app.add_option<objWrapper<float>, float>("-v", iWrapper);
     args = {"-v", "45.3"};
 
     run();
 
-    EXPECT_EQ(iWrapper.value(), 45.3f);
+    CHECK(45.3f == iWrapper.value());
     args = {"-v", "thing"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
 #endif
@@ -461,26 +461,26 @@ class dobjWrapper {
     int ival_{0};
 };
 
-TEST_F(TApp, dobjWrapper) {
+TEST_CASE_METHOD(TApp, "dobjWrapper", "[newparse]") {
     dobjWrapper iWrapper;
     app.add_option("-v", iWrapper);
     args = {"-v", "45"};
 
     run();
 
-    EXPECT_EQ(iWrapper.ivalue(), 45);
-    EXPECT_EQ(iWrapper.dvalue(), 0.0);
+    CHECK(45 == iWrapper.ivalue());
+    CHECK(0.0 == iWrapper.dvalue());
 
     args = {"-v", "thing"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
     iWrapper = dobjWrapper{};
 
     args = {"-v", "45.1"};
 
     run();
-    EXPECT_EQ(iWrapper.ivalue(), 0);
-    EXPECT_EQ(iWrapper.dvalue(), 45.1);
+    CHECK(0 == iWrapper.ivalue());
+    CHECK(45.1 == iWrapper.dvalue());
 }
 
 /// simple class to wrap another  with a very specific type constructor and assignment operators to test out some of the
@@ -507,25 +507,25 @@ template <class X> class AobjWrapper {
 static_assert(std::is_assignable<AobjWrapper<std::uint16_t> &, std::uint16_t>::value,
               "AobjWrapper not assignable like it should be ");
 
-TEST_F(TApp, uint16Wrapper) {
+TEST_CASE_METHOD(TApp, "uint16Wrapper", "[newparse]") {
     AobjWrapper<std::uint16_t> sWrapper;
     app.add_option<AobjWrapper<std::uint16_t>, std::uint16_t>("-v", sWrapper);
     args = {"-v", "9"};
 
     run();
 
-    EXPECT_EQ(sWrapper.value(), 9u);
+    CHECK(9u == sWrapper.value());
     args = {"-v", "thing"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
     args = {"-v", "72456245754"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
     args = {"-v", "-3"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
 template <class T> class SimpleWrapper {
@@ -539,25 +539,25 @@ template <class T> class SimpleWrapper {
     T val_;
 };
 
-TEST_F(TApp, wrapperInt) {
+TEST_CASE_METHOD(TApp, "wrapperInt", "[newparse]") {
     SimpleWrapper<int> wrap;
     app.add_option("--val", wrap);
     args = {"--val", "2"};
 
     run();
-    EXPECT_EQ(wrap.getRef(), 2);
+    CHECK(2 == wrap.getRef());
 }
 
-TEST_F(TApp, wrapperString) {
+TEST_CASE_METHOD(TApp, "wrapperString", "[newparse]") {
     SimpleWrapper<std::string> wrap;
     app.add_option("--val", wrap);
     args = {"--val", "str"};
 
     run();
-    EXPECT_EQ(wrap.getRef(), "str");
+    CHECK("str" == wrap.getRef());
 }
 
-TEST_F(TApp, wrapperVector) {
+TEST_CASE_METHOD(TApp, "wrapperVector", "[newparse]") {
     SimpleWrapper<std::vector<int>> wrap;
     app.add_option("--val", wrap);
     args = {"--val", "1", "2", "3", "4"};
@@ -565,10 +565,10 @@ TEST_F(TApp, wrapperVector) {
     run();
     auto v1 = wrap.getRef();
     auto v2 = std::vector<int>{1, 2, 3, 4};
-    EXPECT_EQ(v1, v2);
+    CHECK(v2 == v1);
 }
 
-TEST_F(TApp, wrapperwrapperString) {
+TEST_CASE_METHOD(TApp, "wrapperwrapperString", "[newparse]") {
     SimpleWrapper<SimpleWrapper<std::string>> wrap;
     app.add_option("--val", wrap);
     args = {"--val", "arg"};
@@ -576,10 +576,10 @@ TEST_F(TApp, wrapperwrapperString) {
     run();
     auto v1 = wrap.getRef().getRef();
     auto v2 = "arg";
-    EXPECT_EQ(v1, v2);
+    CHECK(v2 == v1);
 }
 
-TEST_F(TApp, wrapperwrapperVector) {
+TEST_CASE_METHOD(TApp, "wrapperwrapperVector", "[newparse]") {
     SimpleWrapper<SimpleWrapper<std::vector<int>>> wrap;
     auto opt = app.add_option("--val", wrap);
     args = {"--val", "1", "2", "3", "4"};
@@ -587,20 +587,20 @@ TEST_F(TApp, wrapperwrapperVector) {
     run();
     auto v1 = wrap.getRef().getRef();
     auto v2 = std::vector<int>{1, 2, 3, 4};
-    EXPECT_EQ(v1, v2);
+    CHECK(v2 == v1);
     opt->type_size(0, 5);
 
     args = {"--val"};
 
     run();
-    EXPECT_TRUE(wrap.getRef().getRef().empty());
+    CHECK(wrap.getRef().getRef().empty());
 
     args = {"--val", "happy", "sad"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, wrapperComplex) {
+TEST_CASE_METHOD(TApp, "wrapperComplex", "[newparse]") {
     SimpleWrapper<std::complex<double>> wrap;
     app.add_option("--val", wrap);
     args = {"--val", "1", "2"};
@@ -608,26 +608,26 @@ TEST_F(TApp, wrapperComplex) {
     run();
     auto &v1 = wrap.getRef();
     auto v2 = std::complex<double>{1, 2};
-    EXPECT_EQ(v1.real(), v2.real());
-    EXPECT_EQ(v1.imag(), v2.imag());
+    CHECK(v2.real() == v1.real());
+    CHECK(v2.imag() == v1.imag());
     args = {"--val", "1.4-4j"};
 
     run();
     v2 = std::complex<double>{1.4, -4};
-    EXPECT_EQ(v1.real(), v2.real());
-    EXPECT_EQ(v1.imag(), v2.imag());
+    CHECK(v2.real() == v1.real());
+    CHECK(v2.imag() == v1.imag());
 }
 
-TEST_F(TApp, vectorComplex) {
+TEST_CASE_METHOD(TApp, "vectorComplex", "[newparse]") {
     std::vector<std::complex<double>> vcomplex;
     app.add_option("--val", vcomplex);
     args = {"--val", "1", "2", "--val", "1.4-4j"};
 
     run();
 
-    ASSERT_EQ(vcomplex.size(), 2U);
-    EXPECT_EQ(vcomplex[0].real(), 1.0);
-    EXPECT_EQ(vcomplex[0].imag(), 2.0);
-    EXPECT_EQ(vcomplex[1].real(), 1.4);
-    EXPECT_EQ(vcomplex[1].imag(), -4.0);
+    REQUIRE(2U == vcomplex.size());
+    CHECK(1.0 == vcomplex[0].real());
+    CHECK(2.0 == vcomplex[0].imag());
+    CHECK(1.4 == vcomplex[1].real());
+    CHECK(-4.0 == vcomplex[1].imag());
 }
diff --git a/packages/CLI11/tests/OptionGroupTest.cpp b/packages/CLI11/tests/OptionGroupTest.cpp
index e510c1a390847b0494e60f7608a9b6a85ab073bd..175f31dc3daf5de4da456c7a304a235d31aec42e 100644
--- a/packages/CLI11/tests/OptionGroupTest.cpp
+++ b/packages/CLI11/tests/OptionGroupTest.cpp
@@ -6,15 +6,11 @@
 
 #include "app_helper.hpp"
 
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-using ::testing::HasSubstr;
-using ::testing::Not;
+using Catch::Matchers::Contains;
 
 using vs_t = std::vector<std::string>;
 
-TEST_F(TApp, BasicOptionGroup) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroup", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res;
     ogroup->add_option("--test1", res);
@@ -23,11 +19,11 @@ TEST_F(TApp, BasicOptionGroup) {
 
     args = {"--test1", "5"};
     run();
-    EXPECT_EQ(res, 5);
-    EXPECT_EQ(app.count_all(), 1u);
+    CHECK(5 == res);
+    CHECK(1u == app.count_all());
 }
 
-TEST_F(TApp, BasicOptionGroupExact) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupExact", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -38,20 +34,20 @@ TEST_F(TApp, BasicOptionGroupExact) {
     ogroup->require_option(1);
     args = {"--test1", "5"};
     run();
-    EXPECT_EQ(res, 5);
+    CHECK(5 == res);
 
     args = {"--test1", "5", "--test2", "4"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[Exactly 1");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupExactTooMany) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupExactTooMany", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -61,10 +57,10 @@ TEST_F(TApp, BasicOptionGroupExactTooMany) {
     app.add_option("--option", val2);
     ogroup->require_option(10);
     args = {"--test1", "5"};
-    EXPECT_THROW(run(), CLI::InvalidError);
+    CHECK_THROWS_AS(run(), CLI::InvalidError);
 }
 
-TEST_F(TApp, BasicOptionGroupMinMax) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMinMax", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -75,20 +71,20 @@ TEST_F(TApp, BasicOptionGroupMinMax) {
     ogroup->require_option(1, 1);
     args = {"--test1", "5"};
     run();
-    EXPECT_EQ(res, 5);
+    CHECK(5 == res);
 
     args = {"--test1", "5", "--test2", "4"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[Exactly 1");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMinMaxDifferent) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMinMaxDifferent", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -99,24 +95,24 @@ TEST_F(TApp, BasicOptionGroupMinMaxDifferent) {
     ogroup->require_option(1, 2);
     args = {"--test1", "5"};
     run();
-    EXPECT_EQ(res, 5);
+    CHECK(5 == res);
 
     args = {"--test1", "5", "--test2", "4"};
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(app.count_all(), 2u);
+    CHECK_NOTHROW(run());
+    CHECK(2u == app.count_all());
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[Between 1 and 2");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMinMaxDifferentReversed) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMinMaxDifferentReversed", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -125,30 +121,30 @@ TEST_F(TApp, BasicOptionGroupMinMaxDifferentReversed) {
     int val2{0};
     app.add_option("--option", val2);
     ogroup->require_option(2, 1);
-    EXPECT_EQ(ogroup->get_require_option_min(), 2u);
-    EXPECT_EQ(ogroup->get_require_option_max(), 1u);
+    CHECK(2u == ogroup->get_require_option_min());
+    CHECK(1u == ogroup->get_require_option_max());
     args = {"--test1", "5"};
-    EXPECT_THROW(run(), CLI::InvalidError);
+    CHECK_THROWS_AS(run(), CLI::InvalidError);
     ogroup->require_option(1, 2);
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(res, 5);
-    EXPECT_EQ(ogroup->get_require_option_min(), 1u);
-    EXPECT_EQ(ogroup->get_require_option_max(), 2u);
+    CHECK_NOTHROW(run());
+    CHECK(5 == res);
+    CHECK(1u == ogroup->get_require_option_min());
+    CHECK(2u == ogroup->get_require_option_max());
     args = {"--test1", "5", "--test2", "4"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[Between 1 and 2");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMax) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMax", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -159,20 +155,20 @@ TEST_F(TApp, BasicOptionGroupMax) {
     ogroup->require_option(-2);
     args = {"--test1", "5"};
     run();
-    EXPECT_EQ(res, 5);
+    CHECK(5 == res);
 
     args = {"--option", "9"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[At most 2");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMax1) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMax1", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -183,20 +179,20 @@ TEST_F(TApp, BasicOptionGroupMax1) {
     ogroup->require_option(-1);
     args = {"--test1", "5"};
     run();
-    EXPECT_EQ(res, 5);
+    CHECK(5 == res);
 
     args = {"--option", "9"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"--test1", "5", "--test2", "4"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[At most 1");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMin) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMin", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -207,17 +203,17 @@ TEST_F(TApp, BasicOptionGroupMin) {
     ogroup->require_option();
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[At least 1");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupExact2) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupExact2", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -228,20 +224,20 @@ TEST_F(TApp, BasicOptionGroupExact2) {
     ogroup->require_option(2);
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test3=5"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[Exactly 2");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMin2) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMin2", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     int res{0};
     ogroup->add_option("--test1", res);
@@ -252,17 +248,17 @@ TEST_F(TApp, BasicOptionGroupMin2) {
     ogroup->require_option(2, 0);
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     std::string help = ogroup->help();
     auto exactloc = help.find("[At least 2");
-    EXPECT_NE(exactloc, std::string::npos);
+    CHECK(std::string::npos != exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMinMoved) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMinMoved", "[optiongroup]") {
 
     int res{0};
     auto opt1 = app.add_option("--test1", res);
@@ -278,20 +274,20 @@ TEST_F(TApp, BasicOptionGroupMinMoved) {
     ogroup->add_option(opt3);
 
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     std::string help = app.help();
     auto exactloc = help.find("[At least 1");
     auto oloc = help.find("--test1");
-    EXPECT_NE(exactloc, std::string::npos);
-    EXPECT_NE(oloc, std::string::npos);
-    EXPECT_LT(exactloc, oloc);
+    CHECK(std::string::npos != exactloc);
+    CHECK(std::string::npos != oloc);
+    CHECK(oloc > exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupMinMovedAsGroup) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupMinMovedAsGroup", "[optiongroup]") {
 
     int res{0};
     auto opt1 = app.add_option("--test1", res);
@@ -304,22 +300,22 @@ TEST_F(TApp, BasicOptionGroupMinMovedAsGroup) {
     ogroup->require_option();
     ogroup->add_options(opt1, opt2, opt3);
 
-    EXPECT_THROW(ogroup->add_options(opt1), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup->add_options(opt1), CLI::OptionNotFound);
     args = {"--option", "9"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--test1", "5", "--test2", "4", "--test3=5"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     std::string help = app.help();
     auto exactloc = help.find("[At least 1");
     auto oloc = help.find("--test1");
-    EXPECT_NE(exactloc, std::string::npos);
-    EXPECT_NE(oloc, std::string::npos);
-    EXPECT_LT(exactloc, oloc);
+    CHECK(std::string::npos != exactloc);
+    CHECK(std::string::npos != oloc);
+    CHECK(oloc > exactloc);
 }
 
-TEST_F(TApp, BasicOptionGroupAddFailures) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupAddFailures", "[optiongroup]") {
 
     int res{0};
     auto opt1 = app.add_option("--test1", res);
@@ -328,24 +324,24 @@ TEST_F(TApp, BasicOptionGroupAddFailures) {
     app.add_option("--option", val2);
 
     auto ogroup = app.add_option_group("clusters");
-    EXPECT_THROW(ogroup->add_options(app.get_config_ptr()), CLI::OptionAlreadyAdded);
-    EXPECT_THROW(ogroup->add_options(app.get_help_ptr()), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(ogroup->add_options(app.get_config_ptr()), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(ogroup->add_options(app.get_help_ptr()), CLI::OptionAlreadyAdded);
 
     auto sub = app.add_subcommand("sub", "subcommand");
     auto opt2 = sub->add_option("--option2", val2);
 
-    EXPECT_THROW(ogroup->add_option(opt2), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup->add_option(opt2), CLI::OptionNotFound);
 
-    EXPECT_THROW(ogroup->add_options(nullptr), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup->add_options(nullptr), CLI::OptionNotFound);
 
     ogroup->add_option(opt1);
 
     auto opt3 = app.add_option("--test1", res);
 
-    EXPECT_THROW(ogroup->add_option(opt3), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(ogroup->add_option(opt3), CLI::OptionAlreadyAdded);
 }
 
-TEST_F(TApp, BasicOptionGroupScrewedUpMove) {
+TEST_CASE_METHOD(TApp, "BasicOptionGroupScrewedUpMove", "[optiongroup]") {
 
     int res{0};
     auto opt1 = app.add_option("--test1", res);
@@ -356,25 +352,25 @@ TEST_F(TApp, BasicOptionGroupScrewedUpMove) {
     auto ogroup = app.add_option_group("clusters");
     ogroup->require_option();
     auto ogroup2 = ogroup->add_option_group("clusters2");
-    EXPECT_THROW(ogroup2->add_options(opt1, opt2), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup2->add_options(opt1, opt2), CLI::OptionNotFound);
 
     CLI::Option_group EmptyGroup("description", "new group", nullptr);
 
-    EXPECT_THROW(EmptyGroup.add_option(opt2), CLI::OptionNotFound);
-    EXPECT_THROW(app._move_option(opt2, ogroup2), CLI::OptionNotFound);
+    CHECK_THROWS_AS(EmptyGroup.add_option(opt2), CLI::OptionNotFound);
+    CHECK_THROWS_AS(app._move_option(opt2, ogroup2), CLI::OptionNotFound);
 }
 
-TEST_F(TApp, InvalidOptions) {
+TEST_CASE_METHOD(TApp, "InvalidOptions", "[optiongroup]") {
     auto ogroup = app.add_option_group("clusters");
     CLI::Option *opt = nullptr;
-    EXPECT_THROW(ogroup->excludes(opt), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup->excludes(opt), CLI::OptionNotFound);
     CLI::App *app_p = nullptr;
-    EXPECT_THROW(ogroup->excludes(app_p), CLI::OptionNotFound);
-    EXPECT_THROW(ogroup->excludes(ogroup), CLI::OptionNotFound);
-    EXPECT_THROW(ogroup->add_option(opt), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup->excludes(app_p), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup->excludes(ogroup), CLI::OptionNotFound);
+    CHECK_THROWS_AS(ogroup->add_option(opt), CLI::OptionNotFound);
 }
 
-TEST_F(TApp, OptionGroupInheritedOptionDefaults) {
+TEST_CASE_METHOD(TApp, "OptionGroupInheritedOptionDefaults", "[optiongroup]") {
     app.option_defaults()->ignore_case();
     auto ogroup = app.add_option_group("clusters");
     int res{0};
@@ -382,8 +378,8 @@ TEST_F(TApp, OptionGroupInheritedOptionDefaults) {
 
     args = {"--Test1", "5"};
     run();
-    EXPECT_EQ(res, 5);
-    EXPECT_EQ(app.count_all(), 1u);
+    CHECK(5 == res);
+    CHECK(1u == app.count_all());
 }
 
 struct ManyGroups : public TApp {
@@ -425,48 +421,48 @@ struct ManyGroups : public TApp {
     }
 };
 
-TEST_F(ManyGroups, SingleGroup) {
+TEST_CASE_METHOD(ManyGroups, "SingleGroup", "[optiongroup]") {
     // only 1 group can be used
     main->require_option(1);
     args = {"--name1", "test"};
     run();
-    EXPECT_EQ(name1, "test");
+    CHECK("test" == name1);
 
     args = {"--name2", "test", "--val2", "tval"};
 
     run();
-    EXPECT_EQ(val2, "tval");
+    CHECK("tval" == val2);
 
     args = {"--name1", "test", "--val2", "tval"};
 
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(ManyGroups, ExcludesGroup) {
+TEST_CASE_METHOD(ManyGroups, "ExcludesGroup", "[optiongroup]") {
     // only 1 group can be used
     g1->excludes(g2);
     g1->excludes(g3);
     args = {"--name1", "test"};
     run();
-    EXPECT_EQ(name1, "test");
+    CHECK("test" == name1);
 
     args = {"--name1", "test", "--name2", "test2"};
 
-    EXPECT_THROW(run(), CLI::ExcludesError);
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
 
-    EXPECT_TRUE(g1->remove_excludes(g2));
-    EXPECT_NO_THROW(run());
-    EXPECT_FALSE(g1->remove_excludes(g1));
-    EXPECT_FALSE(g1->remove_excludes(g2));
+    CHECK(g1->remove_excludes(g2));
+    CHECK_NOTHROW(run());
+    CHECK(!g1->remove_excludes(g1));
+    CHECK(!g1->remove_excludes(g2));
 }
 
-TEST_F(ManyGroups, NeedsGroup) {
+TEST_CASE_METHOD(ManyGroups, "NeedsGroup", "[optiongroup]") {
     remove_required();
     // all groups needed if g1 is used
     g1->needs(g2);
     g1->needs(g3);
     args = {"--name1", "test"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
     // other groups should run fine
     args = {"--name2", "test2"};
 
@@ -474,11 +470,11 @@ TEST_F(ManyGroups, NeedsGroup) {
     // all three groups should be fine
     args = {"--name1", "test", "--name2", "test2", "--name3", "test3"};
 
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
 // test adding an option group with existing subcommands to an app
-TEST_F(TApp, ExistingSubcommandMatch) {
+TEST_CASE_METHOD(TApp, "ExistingSubcommandMatch", "[optiongroup]") {
     auto sshared = std::make_shared<CLI::Option_group>("documenting the subcommand", "sub1g", nullptr);
     auto s1 = sshared->add_subcommand("sub1");
     auto o1 = sshared->add_option_group("opt1");
@@ -489,9 +485,9 @@ TEST_F(TApp, ExistingSubcommandMatch) {
     try {
         app.add_subcommand(sshared);
         // this should throw the next line should never be reached
-        EXPECT_FALSE(true);
+        CHECK(!true);
     } catch(const CLI::OptionAlreadyAdded &oaa) {
-        EXPECT_THAT(oaa.what(), HasSubstr("sub1"));
+        CHECK_THAT(oaa.what(), Contains("sub1"));
     }
     sshared->remove_subcommand(s1);
 
@@ -500,40 +496,40 @@ TEST_F(TApp, ExistingSubcommandMatch) {
     try {
         app.add_subcommand(sshared);
         // this should throw the next line should never be reached
-        EXPECT_FALSE(true);
+        CHECK(!true);
     } catch(const CLI::OptionAlreadyAdded &oaa) {
-        EXPECT_THAT(oaa.what(), HasSubstr("sub3"));
+        CHECK_THAT(oaa.what(), Contains("sub3"));
     }
 }
 
-TEST_F(ManyGroups, SingleGroupError) {
+TEST_CASE_METHOD(ManyGroups, "SingleGroupError", "[optiongroup]") {
     // only 1 group can be used
     main->require_option(1);
     args = {"--name1", "test", "--name2", "test3"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(ManyGroups, AtMostOneGroup) {
+TEST_CASE_METHOD(ManyGroups, "AtMostOneGroup", "[optiongroup]") {
     // only 1 group can be used
     main->require_option(0, 1);
     args = {"--name1", "test", "--name2", "test3"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManyGroups, AtLeastTwoGroups) {
+TEST_CASE_METHOD(ManyGroups, "AtLeastTwoGroups", "[optiongroup]") {
     // only 1 group can be used
     main->require_option(2, 0);
     args = {"--name1", "test", "--name2", "test3"};
     run();
 
     args = {"--name1", "test"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(ManyGroups, BetweenOneAndTwoGroups) {
+TEST_CASE_METHOD(ManyGroups, "BetweenOneAndTwoGroups", "[optiongroup]") {
     // only 1 group can be used
     main->require_option(1, 2);
     args = {"--name1", "test", "--name2", "test3"};
@@ -543,19 +539,19 @@ TEST_F(ManyGroups, BetweenOneAndTwoGroups) {
     run();
 
     args = {};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"--name1", "test", "--name2", "test3", "--name3=test3"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(ManyGroups, RequiredFirst) {
+TEST_CASE_METHOD(ManyGroups, "RequiredFirst", "[optiongroup]") {
     // only 1 group can be used
     remove_required();
     g1->required();
 
-    EXPECT_TRUE(g1->get_required());
-    EXPECT_FALSE(g2->get_required());
+    CHECK(g1->get_required());
+    CHECK(!g2->get_required());
     args = {"--name1", "test", "--name2", "test3"};
     run();
 
@@ -563,32 +559,32 @@ TEST_F(ManyGroups, RequiredFirst) {
     try {
         run();
     } catch(const CLI::RequiredError &re) {
-        EXPECT_THAT(re.what(), HasSubstr("g1"));
+        CHECK_THAT(re.what(), Contains("g1"));
     }
 
     args = {"--name1", "test", "--name2", "test3", "--name3=test3"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManyGroups, DisableFirst) {
+TEST_CASE_METHOD(ManyGroups, "DisableFirst", "[optiongroup]") {
     // only 1 group can be used if remove_required not used
     remove_required();
     g1->disabled();
 
-    EXPECT_TRUE(g1->get_disabled());
-    EXPECT_FALSE(g2->get_disabled());
+    CHECK(g1->get_disabled());
+    CHECK(!g2->get_disabled());
     args = {"--name2", "test"};
 
     run();
 
     args = {"--name1", "test", "--name2", "test3"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
     g1->disabled(false);
     args = {"--name1", "test", "--name2", "test3", "--name3=test3"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManyGroups, SameSubcommand) {
+TEST_CASE_METHOD(ManyGroups, "SameSubcommand", "[optiongroup]") {
     // only 1 group can be used if remove_required not used
     remove_required();
     auto sub1 = g1->add_subcommand("sub1")->disabled();
@@ -602,30 +598,30 @@ TEST_F(ManyGroups, SameSubcommand) {
 
     run();
 
-    EXPECT_TRUE(*sub1);
-    EXPECT_TRUE(*sub2);
-    EXPECT_TRUE(*sub3);
+    CHECK(*sub1);
+    CHECK(*sub2);
+    CHECK(*sub3);
     auto subs = app.get_subcommands();
-    EXPECT_EQ(subs.size(), 3u);
-    EXPECT_EQ(subs[0], sub1);
-    EXPECT_EQ(subs[1], sub2);
-    EXPECT_EQ(subs[2], sub3);
+    CHECK(3u == subs.size());
+    CHECK(sub1 == subs[0]);
+    CHECK(sub2 == subs[1]);
+    CHECK(sub3 == subs[2]);
 
     args = {"sub1", "sub1", "sub1", "sub1"};
     // for the 4th and future ones they will route to the first one
     run();
-    EXPECT_EQ(sub1->count(), 2u);
-    EXPECT_EQ(sub2->count(), 1u);
-    EXPECT_EQ(sub3->count(), 1u);
+    CHECK(2u == sub1->count());
+    CHECK(1u == sub2->count());
+    CHECK(1u == sub3->count());
 
     // subs should remain the same since the duplicate would not be registered there
     subs = app.get_subcommands();
-    EXPECT_EQ(subs.size(), 3u);
-    EXPECT_EQ(subs[0], sub1);
-    EXPECT_EQ(subs[1], sub2);
-    EXPECT_EQ(subs[2], sub3);
+    CHECK(3u == subs.size());
+    CHECK(sub1 == subs[0]);
+    CHECK(sub2 == subs[1]);
+    CHECK(sub3 == subs[2]);
 }
-TEST_F(ManyGroups, CallbackOrder) {
+TEST_CASE_METHOD(ManyGroups, "CallbackOrder", "[optiongroup]") {
     // only 1 group can be used if remove_required not used
     remove_required();
     std::vector<int> callback_order;
@@ -635,61 +631,61 @@ TEST_F(ManyGroups, CallbackOrder) {
 
     args = {"--name2", "test"};
     run();
-    EXPECT_EQ(callback_order, std::vector<int>({2, 3}));
+    CHECK(std::vector<int>({2, 3}) == callback_order);
 
     callback_order.clear();
     args = {"--name1", "t2", "--name2", "test"};
     g2->immediate_callback();
     run();
-    EXPECT_EQ(callback_order, std::vector<int>({2, 1, 3}));
+    CHECK(std::vector<int>({2, 1, 3}) == callback_order);
     callback_order.clear();
 
     args = {"--name2", "test", "--name1", "t2"};
     g2->immediate_callback(false);
     run();
-    EXPECT_EQ(callback_order, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<int>({1, 2, 3}) == callback_order);
 }
 
 // Test the fallthrough for extra arguments
-TEST_F(ManyGroups, ExtrasFallDown) {
+TEST_CASE_METHOD(ManyGroups, "ExtrasFallDown", "[optiongroup]") {
     // only 1 group can be used if remove_required not used
     remove_required();
 
     args = {"--test1", "--flag", "extra"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
     main->allow_extras();
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
-    EXPECT_EQ(app.remaining_size(true), 3u);
-    EXPECT_EQ(main->remaining_size(), 3u);
+    CHECK(3u == app.remaining_size(true));
+    CHECK(3u == main->remaining_size());
 
     std::vector<std::string> extras{"--test1", "--flag", "extra"};
-    EXPECT_EQ(app.remaining(true), extras);
-    EXPECT_EQ(main->remaining(), extras);
+    CHECK(extras == app.remaining(true));
+    CHECK(extras == main->remaining());
 }
 
 // Test the option Inheritance
-TEST_F(ManyGroups, Inheritance) {
+TEST_CASE_METHOD(ManyGroups, "Inheritance", "[optiongroup]") {
     remove_required();
     g1->ignore_case();
     g1->ignore_underscore();
     auto t2 = g1->add_subcommand("t2");
     args = {"T2", "t_2"};
-    EXPECT_TRUE(t2->get_ignore_underscore());
-    EXPECT_TRUE(t2->get_ignore_case());
+    CHECK(t2->get_ignore_underscore());
+    CHECK(t2->get_ignore_case());
     run();
-    EXPECT_EQ(t2->count(), 2u);
+    CHECK(2u == t2->count());
 }
 
-TEST_F(ManyGroups, Moving) {
+TEST_CASE_METHOD(ManyGroups, "Moving", "[optiongroup]") {
     remove_required();
     auto mg = app.add_option_group("maing");
     mg->add_subcommand(g1);
     mg->add_subcommand(g2);
 
-    EXPECT_EQ(g1->get_parent(), mg);
-    EXPECT_EQ(g2->get_parent(), mg);
-    EXPECT_EQ(g3->get_parent(), main);
+    CHECK(mg == g1->get_parent());
+    CHECK(mg == g2->get_parent());
+    CHECK(main == g3->get_parent());
 }
 
 struct ManyGroupsPreTrigger : public ManyGroups {
@@ -704,35 +700,35 @@ struct ManyGroupsPreTrigger : public ManyGroups {
     }
 };
 
-TEST_F(ManyGroupsPreTrigger, PreTriggerTestsOptions) {
+TEST_CASE_METHOD(ManyGroupsPreTrigger, "PreTriggerTestsOptions", "[optiongroup]") {
 
     args = {"--name1", "test", "--name2", "test3"};
     run();
-    EXPECT_EQ(triggerMain, 4u);
-    EXPECT_EQ(trigger1, 2u);
-    EXPECT_EQ(trigger2, 0u);
-    EXPECT_EQ(trigger3, 27u);
+    CHECK(4u == triggerMain);
+    CHECK(2u == trigger1);
+    CHECK(0u == trigger2);
+    CHECK(27u == trigger3);
 
     args = {"--name1", "test"};
     trigger2 = 34u;
     run();
-    EXPECT_EQ(triggerMain, 2u);
-    EXPECT_EQ(trigger1, 0u);
-    EXPECT_EQ(trigger2, 34u);
+    CHECK(2u == triggerMain);
+    CHECK(0u == trigger1);
+    CHECK(34u == trigger2);
 
     args = {};
     run();
-    EXPECT_EQ(triggerMain, 0u);
+    CHECK(0u == triggerMain);
 
     args = {"--name1", "test", "--val1", "45", "--name2", "test3", "--name3=test3", "--val2=37"};
     run();
-    EXPECT_EQ(triggerMain, 8u);
-    EXPECT_EQ(trigger1, 6u);
-    EXPECT_EQ(trigger2, 2u);
-    EXPECT_EQ(trigger3, 1u);
+    CHECK(8u == triggerMain);
+    CHECK(6u == trigger1);
+    CHECK(2u == trigger2);
+    CHECK(1u == trigger3);
 }
 
-TEST_F(ManyGroupsPreTrigger, PreTriggerTestsPositionals) {
+TEST_CASE_METHOD(ManyGroupsPreTrigger, "PreTriggerTestsPositionals", "[optiongroup]") {
     // only 1 group can be used
     g1->add_option("pos1");
     g2->add_option("pos2");
@@ -740,26 +736,26 @@ TEST_F(ManyGroupsPreTrigger, PreTriggerTestsPositionals) {
 
     args = {"pos1"};
     run();
-    EXPECT_EQ(triggerMain, 1u);
-    EXPECT_EQ(trigger1, 0u);
-    EXPECT_EQ(trigger2, 34u);
-    EXPECT_EQ(trigger3, 27u);
+    CHECK(1u == triggerMain);
+    CHECK(0u == trigger1);
+    CHECK(34u == trigger2);
+    CHECK(27u == trigger3);
 
     args = {"pos1", "pos2"};
     run();
-    EXPECT_EQ(triggerMain, 2u);
-    EXPECT_EQ(trigger1, 1u);
-    EXPECT_EQ(trigger2, 0u);
+    CHECK(2u == triggerMain);
+    CHECK(1u == trigger1);
+    CHECK(0u == trigger2);
 
     args = {"pos1", "pos2", "pos3"};
     run();
-    EXPECT_EQ(triggerMain, 3u);
-    EXPECT_EQ(trigger1, 2u);
-    EXPECT_EQ(trigger2, 1u);
-    EXPECT_EQ(trigger3, 0u);
+    CHECK(3u == triggerMain);
+    CHECK(2u == trigger1);
+    CHECK(1u == trigger2);
+    CHECK(0u == trigger3);
 }
 
-TEST_F(ManyGroupsPreTrigger, PreTriggerTestsSubcommand) {
+TEST_CASE_METHOD(ManyGroupsPreTrigger, "PreTriggerTestsSubcommand", "[optiongroup]") {
 
     auto sub1 = g1->add_subcommand("sub1")->fallthrough();
     g2->add_subcommand("sub2")->fallthrough();
@@ -769,23 +765,23 @@ TEST_F(ManyGroupsPreTrigger, PreTriggerTestsSubcommand) {
     sub1->preparse_callback([&subtrigger](std::size_t count) { subtrigger = count; });
     args = {"sub1"};
     run();
-    EXPECT_EQ(triggerMain, 1u);
-    EXPECT_EQ(trigger1, 0u);
-    EXPECT_EQ(trigger2, 34u);
-    EXPECT_EQ(trigger3, 27u);
+    CHECK(1u == triggerMain);
+    CHECK(0u == trigger1);
+    CHECK(34u == trigger2);
+    CHECK(27u == trigger3);
 
     args = {"sub1", "sub2"};
     run();
-    EXPECT_EQ(triggerMain, 2u);
-    EXPECT_EQ(subtrigger, 1u);
-    EXPECT_EQ(trigger1, 1u);
-    EXPECT_EQ(trigger2, 0u);
+    CHECK(2u == triggerMain);
+    CHECK(1u == subtrigger);
+    CHECK(1u == trigger1);
+    CHECK(0u == trigger2);
 
     args = {"sub2", "sub3", "--name1=test", "sub1"};
     run();
-    EXPECT_EQ(triggerMain, 4u);
-    EXPECT_EQ(trigger1, 1u);
-    EXPECT_EQ(trigger2, 3u);
-    EXPECT_EQ(trigger3, 1u);  // processes the first argument in group3 which includes the entire subcommand, which will
-                              // go until the sub1 command is given
+    CHECK(4u == triggerMain);
+    CHECK(1u == trigger1);
+    CHECK(3u == trigger2);
+    CHECK(1u == trigger3);
+    // go until the sub1 command is given
 }
diff --git a/packages/CLI11/tests/OptionTypeTest.cpp b/packages/CLI11/tests/OptionTypeTest.cpp
index bbf71fdfb4a962984169895e6557b45fd2bc4805..a2f893bec598d56cbf17bd76c1de5c62b83eb19d 100644
--- a/packages/CLI11/tests/OptionTypeTest.cpp
+++ b/packages/CLI11/tests/OptionTypeTest.cpp
@@ -19,46 +19,44 @@
 #include <unordered_set>
 #include <vector>
 
-#include "gmock/gmock.h"
-
-TEST_F(TApp, OneStringAgain) {
+TEST_CASE_METHOD(TApp, "OneStringAgain", "[optiontype]") {
     std::string str;
     app.add_option("-s,--string", str);
     args = {"--string", "mystring"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, OneStringFunction) {
+TEST_CASE_METHOD(TApp, "OneStringFunction", "[optiontype]") {
     std::string str;
     app.add_option_function<std::string>("-s,--string", [&str](const std::string &val) { str = val; });
     args = {"--string", "mystring"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--string"));
-    EXPECT_EQ(str, "mystring");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--string") == 1u);
+    CHECK("mystring" == str);
 }
 
-TEST_F(TApp, doubleFunction) {
+TEST_CASE_METHOD(TApp, "doubleFunction", "[optiontype]") {
     double res{0.0};
     app.add_option_function<double>("--val", [&res](double val) { res = std::abs(val + 54); });
     args = {"--val", "-354.356"};
     run();
-    EXPECT_EQ(res, 300.356);
+    CHECK(300.356 == res);
     // get the original value as entered as an integer
-    EXPECT_EQ(app["--val"]->as<float>(), -354.356f);
+    CHECK(-354.356f == app["--val"]->as<float>());
 }
 
-TEST_F(TApp, doubleFunctionFail) {
+TEST_CASE_METHOD(TApp, "doubleFunctionFail", "[optiontype]") {
     double res;
     app.add_option_function<double>("--val", [&res](double val) { res = std::abs(val + 54); });
     args = {"--val", "not_double"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, doubleVectorFunction) {
+TEST_CASE_METHOD(TApp, "doubleVectorFunction", "[optiontype]") {
     std::vector<double> res;
     app.add_option_function<std::vector<double>>("--val", [&res](const std::vector<double> &val) {
         res = val;
@@ -66,12 +64,12 @@ TEST_F(TApp, doubleVectorFunction) {
     });
     args = {"--val", "5", "--val", "6", "--val", "7"};
     run();
-    EXPECT_EQ(res.size(), 3u);
-    EXPECT_EQ(res[0], 10.0);
-    EXPECT_EQ(res[2], 12.0);
+    CHECK(3u == res.size());
+    CHECK(10.0 == res[0]);
+    CHECK(12.0 == res[2]);
 }
 
-TEST_F(TApp, doubleVectorFunctionFail) {
+TEST_CASE_METHOD(TApp, "doubleVectorFunctionFail", "[optiontype]") {
     std::vector<double> res;
     std::string vstring = "--val";
     app.add_option_function<std::vector<double>>(vstring, [&res](const std::vector<double> &val) {
@@ -79,14 +77,14 @@ TEST_F(TApp, doubleVectorFunctionFail) {
         std::transform(res.begin(), res.end(), res.begin(), [](double v) { return v + 5.0; });
     });
     args = {"--val", "five", "--val", "nine", "--val", "7"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
     // check that getting the results through the results function generates the same error
-    EXPECT_THROW(app[vstring]->results(res), CLI::ConversionError);
+    CHECK_THROWS_AS(app[vstring]->results(res), CLI::ConversionError);
     auto strvec = app[vstring]->as<std::vector<std::string>>();
-    EXPECT_EQ(strvec.size(), 3u);
+    CHECK(3u == strvec.size());
 }
 
-TEST_F(TApp, doubleVectorFunctionRunCallbackOnDefault) {
+TEST_CASE_METHOD(TApp, "doubleVectorFunctionRunCallbackOnDefault", "[optiontype]") {
     std::vector<double> res;
     auto opt = app.add_option_function<std::vector<double>>("--val", [&res](const std::vector<double> &val) {
         res = val;
@@ -94,26 +92,26 @@ TEST_F(TApp, doubleVectorFunctionRunCallbackOnDefault) {
     });
     args = {"--val", "5", "--val", "6", "--val", "7"};
     run();
-    EXPECT_EQ(res.size(), 3u);
-    EXPECT_EQ(res[0], 10.0);
-    EXPECT_EQ(res[2], 12.0);
-    EXPECT_FALSE(opt->get_run_callback_for_default());
+    CHECK(3u == res.size());
+    CHECK(10.0 == res[0]);
+    CHECK(12.0 == res[2]);
+    CHECK(!opt->get_run_callback_for_default());
     opt->run_callback_for_default();
     opt->default_val(std::vector<int>{2, 1, -2});
-    EXPECT_EQ(res[0], 7.0);
-    EXPECT_EQ(res[2], 3.0);
+    CHECK(7.0 == res[0]);
+    CHECK(3.0 == res[2]);
 
-    EXPECT_THROW(opt->default_val("this is a string"), CLI::ConversionError);
+    CHECK_THROWS_AS(opt->default_val("this is a string"), CLI::ConversionError);
     auto vec = opt->as<std::vector<double>>();
-    ASSERT_EQ(vec.size(), 3U);
-    EXPECT_EQ(vec[0], 5.0);
-    EXPECT_EQ(vec[2], 7.0);
+    REQUIRE(3U == vec.size());
+    CHECK(5.0 == vec[0]);
+    CHECK(7.0 == vec[2]);
     opt->check(CLI::Number);
     opt->run_callback_for_default(false);
-    EXPECT_THROW(opt->default_val("this is a string"), CLI::ValidationError);
+    CHECK_THROWS_AS(opt->default_val("this is a string"), CLI::ValidationError);
 }
 
-TEST_F(TApp, BoolAndIntFlags) {
+TEST_CASE_METHOD(TApp, "BoolAndIntFlags", "[optiontype]") {
 
     bool bflag{false};
     int iflag{0};
@@ -125,24 +123,24 @@ TEST_F(TApp, BoolAndIntFlags) {
 
     args = {"-b", "-i", "-u"};
     run();
-    EXPECT_TRUE(bflag);
-    EXPECT_EQ(1, iflag);
-    EXPECT_EQ((unsigned int)1, uflag);
+    CHECK(bflag);
+    CHECK(iflag == 1);
+    CHECK(uflag == (unsigned int)1);
 
     args = {"-b", "-b"};
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(bflag);
+    REQUIRE_NOTHROW(run());
+    CHECK(bflag);
 
     bflag = false;
 
     args = {"-iiiuu"};
     run();
-    EXPECT_FALSE(bflag);
-    EXPECT_EQ(3, iflag);
-    EXPECT_EQ((unsigned int)2, uflag);
+    CHECK(!bflag);
+    CHECK(iflag == 3);
+    CHECK(uflag == (unsigned int)2);
 }
 
-TEST_F(TApp, atomic_bool_flags) {
+TEST_CASE_METHOD(TApp, "atomic_bool_flags", "[optiontype]") {
 
     std::atomic<bool> bflag{false};
     std::atomic<int> iflag{0};
@@ -152,95 +150,95 @@ TEST_F(TApp, atomic_bool_flags) {
 
     args = {"-b", "-i"};
     run();
-    EXPECT_TRUE(bflag.load());
-    EXPECT_EQ(1, iflag.load());
+    CHECK(bflag.load());
+    CHECK(iflag.load() == 1);
 
     args = {"-b", "-b"};
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(bflag.load());
+    REQUIRE_NOTHROW(run());
+    CHECK(bflag.load());
 
     bflag = false;
 
     args = {"-iii"};
     run();
-    EXPECT_FALSE(bflag.load());
-    EXPECT_EQ(3, iflag.load());
+    CHECK(!bflag.load());
+    CHECK(iflag.load() == 3);
     args = {"--int=notanumber"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, BoolOption) {
+TEST_CASE_METHOD(TApp, "BoolOption", "[optiontype]") {
     bool bflag{false};
     app.add_option("-b", bflag);
 
     args = {"-b", "false"};
     run();
-    EXPECT_FALSE(bflag);
+    CHECK(!bflag);
 
     args = {"-b", "1"};
     run();
-    EXPECT_TRUE(bflag);
+    CHECK(bflag);
 
     args = {"-b", "-7"};
     run();
-    EXPECT_FALSE(bflag);
+    CHECK(!bflag);
 
     // cause an out of bounds error internally
     args = {"-b", "751615654161688126132138844896646748852"};
     run();
-    EXPECT_TRUE(bflag);
+    CHECK(bflag);
 
     args = {"-b", "-751615654161688126132138844896646748852"};
     run();
-    EXPECT_FALSE(bflag);
+    CHECK(!bflag);
 }
 
-TEST_F(TApp, atomic_int_option) {
+TEST_CASE_METHOD(TApp, "atomic_int_option", "[optiontype]") {
     std::atomic<int> i{0};
     auto aopt = app.add_option("-i,--int", i);
     args = {"-i4"};
     run();
-    EXPECT_EQ(1u, app.count("--int"));
-    EXPECT_EQ(1u, app.count("-i"));
-    EXPECT_EQ(i, 4);
-    EXPECT_EQ(app["-i"]->as<std::string>(), "4");
-    EXPECT_EQ(app["--int"]->as<double>(), 4.0);
+    CHECK(app.count("--int") == 1u);
+    CHECK(app.count("-i") == 1u);
+    CHECK(4 == i);
+    CHECK("4" == app["-i"]->as<std::string>());
+    CHECK(4.0 == app["--int"]->as<double>());
 
     args = {"--int", "notAnInt"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
     aopt->expected(0, 1);
     args = {"--int"};
     run();
-    EXPECT_EQ(i, 0);
+    CHECK(0 == i);
 }
 
-TEST_F(TApp, CharOption) {
+TEST_CASE_METHOD(TApp, "CharOption", "[optiontype]") {
     char c1{'t'};
     app.add_option("-c", c1);
 
     args = {"-c", "g"};
     run();
-    EXPECT_EQ(c1, 'g');
+    CHECK('g' == c1);
 
     args = {"-c", "1"};
     run();
-    EXPECT_EQ(c1, '1');
+    CHECK('1' == c1);
 
     args = {"-c", "77"};
     run();
-    EXPECT_EQ(c1, 77);
+    CHECK(77 == c1);
 
     // convert hex for digit
     args = {"-c", "0x44"};
     run();
-    EXPECT_EQ(c1, 0x44);
+    CHECK(0x44 == c1);
 
     args = {"-c", "751615654161688126132138844896646748852"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, vectorDefaults) {
+TEST_CASE_METHOD(TApp, "vectorDefaults", "[optiontype]") {
     std::vector<int> vals{4, 5};
     auto opt = app.add_option("--long", vals, "", true);
 
@@ -248,30 +246,30 @@ TEST_F(TApp, vectorDefaults) {
 
     run();
 
-    EXPECT_EQ(vals, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<int>({1, 2, 3}) == vals);
 
     args.clear();
     run();
     auto res = app["--long"]->as<std::vector<int>>();
-    EXPECT_EQ(res, std::vector<int>({4, 5}));
+    CHECK(std::vector<int>({4, 5}) == res);
 
     app.clear();
     opt->expected(1)->take_last();
     res = app["--long"]->as<std::vector<int>>();
-    EXPECT_EQ(res, std::vector<int>({5}));
+    CHECK(std::vector<int>({5}) == res);
     opt->take_first();
     res = app["--long"]->as<std::vector<int>>();
-    EXPECT_EQ(res, std::vector<int>({4}));
+    CHECK(std::vector<int>({4}) == res);
 
     opt->expected(0, 1)->take_last();
     run();
 
-    EXPECT_EQ(res, std::vector<int>({4}));
+    CHECK(std::vector<int>({4}) == res);
     res = app["--long"]->as<std::vector<int>>();
-    EXPECT_EQ(res, std::vector<int>({5}));
+    CHECK(std::vector<int>({5}) == res);
 }
 
-TEST_F(TApp, CallbackBoolFlags) {
+TEST_CASE_METHOD(TApp, "CallbackBoolFlags", "[optiontype]") {
 
     bool value{false};
 
@@ -280,24 +278,24 @@ TEST_F(TApp, CallbackBoolFlags) {
     auto cback = app.add_flag_callback("--val", func);
     args = {"--val"};
     run();
-    EXPECT_TRUE(value);
+    CHECK(value);
     value = false;
     args = {"--val=false"};
     run();
-    EXPECT_FALSE(value);
+    CHECK(!value);
 
-    EXPECT_THROW(app.add_flag_callback("hi", func), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_flag_callback("hi", func), CLI::IncorrectConstruction);
     cback->multi_option_policy(CLI::MultiOptionPolicy::Throw);
     args = {"--val", "--val=false"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, pair_check) {
+TEST_CASE_METHOD(TApp, "pair_check", "[optiontype]") {
     std::string myfile{"pair_check_file.txt"};
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
+    CHECK(ok);
 
-    EXPECT_TRUE(CLI::ExistingFile(myfile).empty());
+    CHECK(CLI::ExistingFile(myfile).empty());
     std::pair<std::string, int> findex;
 
     auto v0 = CLI::ExistingFile;
@@ -308,112 +306,112 @@ TEST_F(TApp, pair_check) {
 
     args = {"--file", myfile, "2"};
 
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
-    EXPECT_EQ(findex.first, myfile);
-    EXPECT_EQ(findex.second, 2);
+    CHECK(myfile == findex.first);
+    CHECK(2 == findex.second);
 
     args = {"--file", myfile, "-3"};
 
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--file", myfile, "2"};
     std::remove(myfile.c_str());
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
 // this will require that modifying the multi-option policy for tuples be allowed which it isn't at present
 
-TEST_F(TApp, pair_check_take_first) {
+TEST_CASE_METHOD(TApp, "pair_check_take_first", "[optiontype]") {
     std::string myfile{"pair_check_file2.txt"};
     bool ok = static_cast<bool>(std::ofstream(myfile.c_str()).put('a'));  // create file
-    EXPECT_TRUE(ok);
+    CHECK(ok);
 
-    EXPECT_TRUE(CLI::ExistingFile(myfile).empty());
+    CHECK(CLI::ExistingFile(myfile).empty());
     std::pair<std::string, int> findex;
 
     auto opt = app.add_option("--file", findex)->check(CLI::ExistingFile)->check(CLI::PositiveNumber);
-    EXPECT_THROW(opt->get_validator(3), CLI::OptionNotFound);
+    CHECK_THROWS_AS(opt->get_validator(3), CLI::OptionNotFound);
     opt->get_validator(0)->application_index(0);
     opt->get_validator(1)->application_index(1);
     opt->multi_option_policy(CLI::MultiOptionPolicy::TakeLast);
     args = {"--file", "not_a_file.txt", "-16", "--file", myfile, "2"};
     // should only check the last one
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
-    EXPECT_EQ(findex.first, myfile);
-    EXPECT_EQ(findex.second, 2);
+    CHECK(myfile == findex.first);
+    CHECK(2 == findex.second);
 
     opt->multi_option_policy(CLI::MultiOptionPolicy::TakeFirst);
 
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, VectorFixedString) {
+TEST_CASE_METHOD(TApp, "VectorFixedString", "[optiontype]") {
     std::vector<std::string> strvec;
     std::vector<std::string> answer{"mystring", "mystring2", "mystring3"};
 
     CLI::Option *opt = app.add_option("-s,--string", strvec)->expected(3);
-    EXPECT_EQ(3, opt->get_expected());
+    CHECK(opt->get_expected() == 3);
 
     args = {"--string", "mystring", "mystring2", "mystring3"};
     run();
-    EXPECT_EQ(3u, app.count("--string"));
-    EXPECT_EQ(answer, strvec);
+    CHECK(app.count("--string") == 3u);
+    CHECK(strvec == answer);
 }
 
-TEST_F(TApp, VectorDefaultedFixedString) {
+TEST_CASE_METHOD(TApp, "VectorDefaultedFixedString", "[optiontype]") {
     std::vector<std::string> strvec{"one"};
     std::vector<std::string> answer{"mystring", "mystring2", "mystring3"};
 
     CLI::Option *opt = app.add_option("-s,--string", strvec, "")->expected(3)->capture_default_str();
-    EXPECT_EQ(3, opt->get_expected());
+    CHECK(opt->get_expected() == 3);
 
     args = {"--string", "mystring", "mystring2", "mystring3"};
     run();
-    EXPECT_EQ(3u, app.count("--string"));
-    EXPECT_EQ(answer, strvec);
+    CHECK(app.count("--string") == 3u);
+    CHECK(strvec == answer);
 }
 
-TEST_F(TApp, VectorIndexedValidator) {
+TEST_CASE_METHOD(TApp, "VectorIndexedValidator", "[optiontype]") {
     std::vector<int> vvec;
 
     CLI::Option *opt = app.add_option("-v", vvec);
 
     args = {"-v", "1", "-1", "-v", "3", "-v", "-976"};
     run();
-    EXPECT_EQ(4u, app.count("-v"));
-    EXPECT_EQ(4u, vvec.size());
+    CHECK(app.count("-v") == 4u);
+    CHECK(vvec.size() == 4u);
     opt->check(CLI::PositiveNumber.application_index(0));
     opt->check((!CLI::PositiveNumber).application_index(1));
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(4u, vvec.size());
+    CHECK_NOTHROW(run());
+    CHECK(vvec.size() == 4u);
     // v[3] would be negative
     opt->check(CLI::PositiveNumber.application_index(3));
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, VectorUnlimString) {
+TEST_CASE_METHOD(TApp, "VectorUnlimString", "[optiontype]") {
     std::vector<std::string> strvec;
     std::vector<std::string> answer{"mystring", "mystring2", "mystring3"};
 
     CLI::Option *opt = app.add_option("-s,--string", strvec);
-    EXPECT_EQ(1, opt->get_expected());
-    EXPECT_EQ(CLI::detail::expected_max_vector_size, opt->get_expected_max());
+    CHECK(opt->get_expected() == 1);
+    CHECK(opt->get_expected_max() == CLI::detail::expected_max_vector_size);
 
     args = {"--string", "mystring", "mystring2", "mystring3"};
     run();
-    EXPECT_EQ(3u, app.count("--string"));
-    EXPECT_EQ(answer, strvec);
+    CHECK(app.count("--string") == 3u);
+    CHECK(strvec == answer);
 
     args = {"-s", "mystring", "mystring2", "mystring3"};
     run();
-    EXPECT_EQ(3u, app.count("--string"));
-    EXPECT_EQ(answer, strvec);
+    CHECK(app.count("--string") == 3u);
+    CHECK(strvec == answer);
 }
 
 // From https://github.com/CLIUtils/CLI11/issues/420
-TEST_F(TApp, stringLikeTests) {
+TEST_CASE_METHOD(TApp, "stringLikeTests", "[optiontype]") {
     struct nType {
         explicit nType(const std::string &a_value) : m_value{a_value} {}
 
@@ -426,14 +424,14 @@ TEST_F(TApp, stringLikeTests) {
     app.add_option("--type", m_type, "type")->capture_default_str();
     run();
 
-    EXPECT_EQ(app["--type"]->as<std::string>(), "op str");
+    CHECK("op str" == app["--type"]->as<std::string>());
     args = {"--type", "bca"};
     run();
-    EXPECT_EQ(std::string(m_type), "op str");
-    EXPECT_EQ(m_type.m_value, "bca");
+    CHECK("op str" == std::string(m_type));
+    CHECK("bca" == m_type.m_value);
 }
 
-TEST_F(TApp, VectorExpectedRange) {
+TEST_CASE_METHOD(TApp, "VectorExpectedRange", "[optiontype]") {
     std::vector<std::string> strvec;
 
     CLI::Option *opt = app.add_option("--string", strvec);
@@ -441,47 +439,47 @@ TEST_F(TApp, VectorExpectedRange) {
 
     args = {"--string", "mystring", "mystring2", "mystring3"};
     run();
-    EXPECT_EQ(3u, app.count("--string"));
+    CHECK(app.count("--string") == 3u);
 
     args = {"--string", "mystring"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"--string", "mystring", "mystring2", "string2", "--string", "string4", "string5"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
-    EXPECT_EQ(opt->get_expected_max(), 4);
-    EXPECT_EQ(opt->get_expected_min(), 2);
+    CHECK(4 == opt->get_expected_max());
+    CHECK(2 == opt->get_expected_min());
     opt->expected(4, 2);  // just test the handling of reversed arguments
-    EXPECT_EQ(opt->get_expected_max(), 4);
-    EXPECT_EQ(opt->get_expected_min(), 2);
+    CHECK(4 == opt->get_expected_max());
+    CHECK(2 == opt->get_expected_min());
     opt->expected(-5);
-    EXPECT_EQ(opt->get_expected_max(), 5);
-    EXPECT_EQ(opt->get_expected_min(), 5);
+    CHECK(5 == opt->get_expected_max());
+    CHECK(5 == opt->get_expected_min());
     opt->expected(-5, 7);
-    EXPECT_EQ(opt->get_expected_max(), 7);
-    EXPECT_EQ(opt->get_expected_min(), 5);
+    CHECK(7 == opt->get_expected_max());
+    CHECK(5 == opt->get_expected_min());
 }
 
-TEST_F(TApp, VectorFancyOpts) {
+TEST_CASE_METHOD(TApp, "VectorFancyOpts", "[optiontype]") {
     std::vector<std::string> strvec;
     std::vector<std::string> answer{"mystring", "mystring2", "mystring3"};
 
     CLI::Option *opt = app.add_option("-s,--string", strvec)->required()->expected(3);
-    EXPECT_EQ(3, opt->get_expected());
+    CHECK(opt->get_expected() == 3);
 
     args = {"--string", "mystring", "mystring2", "mystring3"};
     run();
-    EXPECT_EQ(3u, app.count("--string"));
-    EXPECT_EQ(answer, strvec);
+    CHECK(app.count("--string") == 3u);
+    CHECK(strvec == answer);
 
     args = {"one", "two"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
-    EXPECT_THROW(run(), CLI::ParseError);
+    CHECK_THROWS_AS(run(), CLI::ParseError);
 }
 
 // #87
-TEST_F(TApp, CustomDoubleOption) {
+TEST_CASE_METHOD(TApp, "CustomDoubleOption", "[optiontype]") {
 
     std::pair<int, double> custom_opt;
 
@@ -494,12 +492,12 @@ TEST_F(TApp, CustomDoubleOption) {
     args = {"12", "1.5"};
 
     run();
-    EXPECT_EQ(custom_opt.first, 12);
-    EXPECT_DOUBLE_EQ(custom_opt.second, 1.5);
+    CHECK(12 == custom_opt.first);
+    CHECK(1.5 == Approx(custom_opt.second));
 }
 
 // now with tuple support this is possible
-TEST_F(TApp, CustomDoubleOptionAlt) {
+TEST_CASE_METHOD(TApp, "CustomDoubleOptionAlt", "[optiontype]") {
 
     std::pair<int, double> custom_opt;
 
@@ -508,12 +506,12 @@ TEST_F(TApp, CustomDoubleOptionAlt) {
     args = {"12", "1.5"};
 
     run();
-    EXPECT_EQ(custom_opt.first, 12);
-    EXPECT_DOUBLE_EQ(custom_opt.second, 1.5);
+    CHECK(12 == custom_opt.first);
+    CHECK(1.5 == Approx(custom_opt.second));
 }
 
 // now with independent type sizes and expected this is possible
-TEST_F(TApp, vectorPair) {
+TEST_CASE_METHOD(TApp, "vectorPair", "[optiontype]") {
 
     std::vector<std::pair<int, std::string>> custom_opt;
 
@@ -522,21 +520,21 @@ TEST_F(TApp, vectorPair) {
     args = {"--dict", "1", "str1", "--dict", "3", "str3"};
 
     run();
-    ASSERT_EQ(custom_opt.size(), 2u);
-    EXPECT_EQ(custom_opt[0].first, 1);
-    EXPECT_EQ(custom_opt[1].second, "str3");
+    REQUIRE(2u == custom_opt.size());
+    CHECK(1 == custom_opt[0].first);
+    CHECK("str3" == custom_opt[1].second);
 
     args = {"--dict", "1", "str1", "--dict", "3", "str3", "--dict", "-1", "str4"};
     run();
-    ASSERT_EQ(custom_opt.size(), 3u);
-    EXPECT_EQ(custom_opt[2].first, -1);
-    EXPECT_EQ(custom_opt[2].second, "str4");
+    REQUIRE(3u == custom_opt.size());
+    CHECK(-1 == custom_opt[2].first);
+    CHECK("str4" == custom_opt[2].second);
     opt->check(CLI::PositiveNumber.application_index(0));
 
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, vectorPairFail) {
+TEST_CASE_METHOD(TApp, "vectorPairFail", "[optiontype]") {
 
     std::vector<std::pair<int, std::string>> custom_opt;
 
@@ -544,46 +542,46 @@ TEST_F(TApp, vectorPairFail) {
 
     args = {"--dict", "1", "str1", "--dict", "str3", "1"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, vectorPairTypeRange) {
+TEST_CASE_METHOD(TApp, "vectorPairTypeRange", "[optiontype]") {
 
     std::vector<std::pair<int, std::string>> custom_opt;
 
     auto opt = app.add_option("--dict", custom_opt);
 
     opt->type_size(2, 1);  // just test switched arguments
-    EXPECT_EQ(opt->get_type_size_min(), 1);
-    EXPECT_EQ(opt->get_type_size_max(), 2);
+    CHECK(1 == opt->get_type_size_min());
+    CHECK(2 == opt->get_type_size_max());
 
     args = {"--dict", "1", "str1", "--dict", "3", "str3"};
 
     run();
-    ASSERT_EQ(custom_opt.size(), 2u);
-    EXPECT_EQ(custom_opt[0].first, 1);
-    EXPECT_EQ(custom_opt[1].second, "str3");
+    REQUIRE(2u == custom_opt.size());
+    CHECK(1 == custom_opt[0].first);
+    CHECK("str3" == custom_opt[1].second);
 
     args = {"--dict", "1", "str1", "--dict", "3", "--dict", "-1", "str4"};
     run();
-    ASSERT_EQ(custom_opt.size(), 3u);
-    EXPECT_TRUE(custom_opt[1].second.empty());
-    EXPECT_EQ(custom_opt[2].first, -1);
-    EXPECT_EQ(custom_opt[2].second, "str4");
+    REQUIRE(3u == custom_opt.size());
+    CHECK(custom_opt[1].second.empty());
+    CHECK(-1 == custom_opt[2].first);
+    CHECK("str4" == custom_opt[2].second);
 
     opt->type_size(-2, -1);  // test negative arguments
-    EXPECT_EQ(opt->get_type_size_min(), 1);
-    EXPECT_EQ(opt->get_type_size_max(), 2);
+    CHECK(1 == opt->get_type_size_min());
+    CHECK(2 == opt->get_type_size_max());
     // this type size spec should run exactly as before
     run();
-    ASSERT_EQ(custom_opt.size(), 3u);
-    EXPECT_TRUE(custom_opt[1].second.empty());
-    EXPECT_EQ(custom_opt[2].first, -1);
-    EXPECT_EQ(custom_opt[2].second, "str4");
+    REQUIRE(3u == custom_opt.size());
+    CHECK(custom_opt[1].second.empty());
+    CHECK(-1 == custom_opt[2].first);
+    CHECK("str4" == custom_opt[2].second);
 }
 
 // now with independent type sizes and expected this is possible
-TEST_F(TApp, vectorTuple) {
+TEST_CASE_METHOD(TApp, "vectorTuple", "[optiontype]") {
 
     std::vector<std::tuple<int, std::string, double>> custom_opt;
 
@@ -592,28 +590,28 @@ TEST_F(TApp, vectorTuple) {
     args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7"};
 
     run();
-    ASSERT_EQ(custom_opt.size(), 2u);
-    EXPECT_EQ(std::get<0>(custom_opt[0]), 1);
-    EXPECT_EQ(std::get<1>(custom_opt[1]), "str3");
-    EXPECT_EQ(std::get<2>(custom_opt[1]), 2.7);
+    REQUIRE(2u == custom_opt.size());
+    CHECK(1 == std::get<0>(custom_opt[0]));
+    CHECK("str3" == std::get<1>(custom_opt[1]));
+    CHECK(2.7 == std::get<2>(custom_opt[1]));
 
     args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7", "--dict", "-1", "str4", "-1.87"};
     run();
-    ASSERT_EQ(custom_opt.size(), 3u);
-    EXPECT_EQ(std::get<0>(custom_opt[2]), -1);
-    EXPECT_EQ(std::get<1>(custom_opt[2]), "str4");
-    EXPECT_EQ(std::get<2>(custom_opt[2]), -1.87);
+    REQUIRE(3u == custom_opt.size());
+    CHECK(-1 == std::get<0>(custom_opt[2]));
+    CHECK("str4" == std::get<1>(custom_opt[2]));
+    CHECK(-1.87 == std::get<2>(custom_opt[2]));
     opt->check(CLI::PositiveNumber.application_index(0));
 
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args.back() = "haha";
     args[9] = "45";
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
 // now with independent type sizes and expected this is possible
-TEST_F(TApp, vectorVector) {
+TEST_CASE_METHOD(TApp, "vectorVector", "[optiontype]") {
 
     std::vector<std::vector<int>> custom_opt;
 
@@ -622,34 +620,34 @@ TEST_F(TApp, vectorVector) {
     args = {"--dict", "1", "2", "4", "--dict", "3", "1"};
 
     run();
-    ASSERT_EQ(custom_opt.size(), 2u);
-    EXPECT_EQ(custom_opt[0].size(), 3u);
-    EXPECT_EQ(custom_opt[1].size(), 2u);
+    REQUIRE(2u == custom_opt.size());
+    CHECK(3u == custom_opt[0].size());
+    CHECK(2u == custom_opt[1].size());
 
     args = {"--dict", "1", "2", "4", "--dict", "3", "1", "--dict", "3", "--dict",
             "3",      "3", "3", "3", "3",      "3", "3", "3",      "3", "-3"};
     run();
-    ASSERT_EQ(custom_opt.size(), 4u);
-    EXPECT_EQ(custom_opt[0].size(), 3u);
-    EXPECT_EQ(custom_opt[1].size(), 2u);
-    EXPECT_EQ(custom_opt[2].size(), 1u);
-    EXPECT_EQ(custom_opt[3].size(), 10u);
+    REQUIRE(4u == custom_opt.size());
+    CHECK(3u == custom_opt[0].size());
+    CHECK(2u == custom_opt[1].size());
+    CHECK(1u == custom_opt[2].size());
+    CHECK(10u == custom_opt[3].size());
     opt->check(CLI::PositiveNumber.application_index(9));
 
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
     args.pop_back();
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args.back() = "haha";
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
     args = {"--dict", "1", "2", "4", "%%", "3", "1", "%%", "3", "%%", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3"};
     run();
-    ASSERT_EQ(custom_opt.size(), 4u);
+    REQUIRE(4u == custom_opt.size());
 }
 
 // now with independent type sizes and expected this is possible
-TEST_F(TApp, vectorVectorFixedSize) {
+TEST_CASE_METHOD(TApp, "vectorVectorFixedSize", "[optiontype]") {
 
     std::vector<std::vector<int>> custom_opt;
 
@@ -658,21 +656,21 @@ TEST_F(TApp, vectorVectorFixedSize) {
     args = {"--dict", "1", "2", "4", "3", "--dict", "3", "1", "2", "8"};
 
     run();
-    ASSERT_EQ(custom_opt.size(), 2u);
-    EXPECT_EQ(custom_opt[0].size(), 4u);
-    EXPECT_EQ(custom_opt[1].size(), 4u);
+    REQUIRE(2u == custom_opt.size());
+    CHECK(4u == custom_opt[0].size());
+    CHECK(4u == custom_opt[1].size());
 
     args = {"--dict", "1", "2", "4", "--dict", "3", "1", "7", "6"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
     // this should reset it
     opt->type_size(CLI::detail::expected_max_vector_size);
     opt->type_size(1, CLI::detail::expected_max_vector_size);
-    EXPECT_NO_THROW(run());
-    ASSERT_EQ(custom_opt.size(), 2U);
+    CHECK_NOTHROW(run());
+    REQUIRE(2U == custom_opt.size());
 }
 
 // now with independent type sizes and expected this is possible
-TEST_F(TApp, tuplePair) {
+TEST_CASE_METHOD(TApp, "tuplePair", "[optiontype]") {
     std::tuple<std::pair<int, double>> custom_opt;
 
     app.add_option("--pr", custom_opt);
@@ -680,11 +678,11 @@ TEST_F(TApp, tuplePair) {
     args = {"--pr", "1", "2"};
 
     run();
-    EXPECT_EQ(std::get<0>(custom_opt).first, 1);
-    EXPECT_EQ(std::get<0>(custom_opt).second, 2.0);
+    CHECK(1 == std::get<0>(custom_opt).first);
+    CHECK(2.0 == std::get<0>(custom_opt).second);
 }
 // now with independent type sizes and expected this is possible
-TEST_F(TApp, tupleintPair) {
+TEST_CASE_METHOD(TApp, "tupleintPair", "[optiontype]") {
     std::tuple<int, std::pair<int, double>> custom_opt;
 
     app.add_option("--pr", custom_opt);
@@ -692,9 +690,9 @@ TEST_F(TApp, tupleintPair) {
     args = {"--pr", "3", "1", "2"};
 
     run();
-    EXPECT_EQ(std::get<0>(custom_opt), 3);
-    EXPECT_EQ(std::get<1>(custom_opt).first, 1);
-    EXPECT_EQ(std::get<1>(custom_opt).second, 2.0);
+    CHECK(3 == std::get<0>(custom_opt));
+    CHECK(1 == std::get<1>(custom_opt).first);
+    CHECK(2.0 == std::get<1>(custom_opt).second);
 }
 
 static_assert(CLI::detail::is_mutable_container<std::set<std::string>>::value, "set should be a container");
@@ -719,66 +717,54 @@ static_assert(CLI::detail::type_count<std::list<std::pair<int, std::string>>>::v
 static_assert(CLI::detail::type_count<std::map<std::string, std::pair<int, std::string>>>::value == 3,
               "map<string,pair<int,string>> should have a type size of 3");
 
-template <class T> class TApp_container_single : public TApp {
-  public:
-    using container_type = T;
-    container_type cval{};
-    TApp_container_single() : TApp() {}
-};
-
-using containerTypes_single =
-    ::testing::Types<std::vector<int>, std::deque<int>, std::set<int>, std::list<int>, std::unordered_set<int>>;
-
-TYPED_TEST_SUITE(TApp_container_single, containerTypes_single, );
-
-TYPED_TEST(TApp_container_single, containerInt) {
-
-    auto &cv = TApp_container_single<TypeParam>::cval;
-    CLI::Option *opt = (TApp::app).add_option("-v", cv);
-
-    TApp::args = {"-v", "1", "-1", "-v", "3", "-v", "-976"};
-    TApp::run();
-    EXPECT_EQ(4u, (TApp::app).count("-v"));
-    EXPECT_EQ(4u, cv.size());
+TEMPLATE_TEST_CASE("Container int single",
+                   "[optiontype]",
+                   std::vector<int>,
+                   std::deque<int>,
+                   std::set<int>,
+                   std::list<int>,
+                   std::unordered_set<int>) {
+    TApp tapp;
+    TestType cv;
+
+    CLI::Option *opt = tapp.app.add_option("-v", cv);
+
+    tapp.args = {"-v", "1", "-1", "-v", "3", "-v", "-976"};
+    tapp.run();
+    CHECK(tapp.app.count("-v") == 4u);
+    CHECK(cv.size() == 4u);
     opt->check(CLI::PositiveNumber.application_index(0));
     opt->check((!CLI::PositiveNumber).application_index(1));
-    EXPECT_NO_THROW(TApp::run());
-    EXPECT_EQ(4u, cv.size());
+    CHECK_NOTHROW(tapp.run());
+    CHECK(cv.size() == 4u);
     // v[3] would be negative
     opt->check(CLI::PositiveNumber.application_index(3));
-    EXPECT_THROW(TApp::run(), CLI::ValidationError);
+    CHECK_THROWS_AS(tapp.run(), CLI::ValidationError);
 }
 
-template <class T> class TApp_container_pair : public TApp {
-  public:
-    using container_type = T;
-    container_type cval{};
-    TApp_container_pair() : TApp() {}
-};
-
 using isp = std::pair<int, std::string>;
-using containerTypes_pair = ::testing::Types<std::vector<isp>,
-                                             std::deque<isp>,
-                                             std::set<isp>,
-                                             std::list<isp>,
-                                             std::map<int, std::string>,
-                                             std::unordered_map<int, std::string>>;
 
-TYPED_TEST_SUITE(TApp_container_pair, containerTypes_pair, );
+TEMPLATE_TEST_CASE("Container pair",
+                   "[optiontype]",
+                   std::vector<isp>,
+                   std::deque<isp>,
+                   std::set<isp>,
+                   std::list<isp>,
+                   (std::map<int, std::string>),
+                   (std::unordered_map<int, std::string>)) {
+    TApp tapp;
+    TestType cv;
 
-TYPED_TEST(TApp_container_pair, containerPair) {
+    (tapp.app).add_option("--dict", cv);
 
-    auto &cv = TApp_container_pair<TypeParam>::cval;
-    (TApp::app).add_option("--dict", cv);
+    tapp.args = {"--dict", "1", "str1", "--dict", "3", "str3"};
 
-    TApp::args = {"--dict", "1", "str1", "--dict", "3", "str3"};
+    tapp.run();
+    CHECK(2u == cv.size());
 
-    TApp::run();
-    EXPECT_EQ(cv.size(), 2u);
-
-    TApp::args = {"--dict", "1", "str1", "--dict", "3", "--dict", "-1", "str4"};
-    TApp::run();
-    EXPECT_EQ(cv.size(), 3u);
+    tapp.args = {"--dict", "1", "str1", "--dict", "3", "--dict", "-1", "str4"};
+    tapp.run();
+    CHECK(3u == cv.size());
 }
 
 template <class T> class TApp_container_tuple : public TApp {
@@ -789,28 +775,28 @@ template <class T> class TApp_container_tuple : public TApp {
 };
 
 using tup_obj = std::tuple<int, std::string, double>;
-using containerTypes_tuple = ::testing::Types<std::vector<tup_obj>,
-                                              std::deque<tup_obj>,
-                                              std::set<tup_obj>,
-                                              std::list<tup_obj>,
-                                              std::map<int, std::pair<std::string, double>>,
-                                              std::unordered_map<int, std::tuple<std::string, double>>>;
-
-TYPED_TEST_SUITE(TApp_container_tuple, containerTypes_tuple, );
 
-TYPED_TEST(TApp_container_tuple, containerTuple) {
+TEMPLATE_TEST_CASE("Container tuple",
+                   "[optiontype]",
+                   std::vector<tup_obj>,
+                   std::deque<tup_obj>,
+                   std::set<tup_obj>,
+                   std::list<tup_obj>,
+                   (std::map<int, std::pair<std::string, double>>),
+                   (std::unordered_map<int, std::tuple<std::string, double>>)) {
+    TApp tapp;
+    TestType cv;
 
-    auto &cv = TApp_container_tuple<TypeParam>::cval;
-    (TApp::app).add_option("--dict", cv);
+    (tapp.app).add_option("--dict", cv);
 
-    TApp::args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7"};
+    tapp.args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7"};
 
-    TApp::run();
-    EXPECT_EQ(cv.size(), 2u);
+    tapp.run();
+    CHECK(2u == cv.size());
 
-    TApp::args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7", "--dict", "-1", "str4", "-1.87"};
-    TApp::run();
-    EXPECT_EQ(cv.size(), 3u);
+    tapp.args = {"--dict", "1", "str1", "4.3", "--dict", "3", "str3", "2.7", "--dict", "-1", "str4", "-1.87"};
+    tapp.run();
+    CHECK(3u == cv.size());
 }
 
 using icontainer1 = std::vector<int>;
@@ -818,45 +804,37 @@ using icontainer2 = std::list<int>;
 using icontainer3 = std::set<int>;
 using icontainer4 = std::pair<int, std::vector<int>>;
 
-using containerTypes_container = ::testing::Types<std::vector<icontainer1>,
-                                                  std::list<icontainer1>,
-                                                  std::set<icontainer1>,
-                                                  std::deque<icontainer1>,
-                                                  std::vector<icontainer2>,
-                                                  std::list<icontainer2>,
-                                                  std::set<icontainer2>,
-                                                  std::deque<icontainer2>,
-                                                  std::vector<icontainer3>,
-                                                  std::list<icontainer3>,
-                                                  std::set<icontainer3>,
-                                                  std::deque<icontainer3>>;
-
-template <class T> class TApp_container_container : public TApp {
-  public:
-    using container_type = T;
-    container_type cval{};
-    TApp_container_container() : TApp() {}
-};
-
-TYPED_TEST_SUITE(TApp_container_container, containerTypes_container, );
-
-TYPED_TEST(TApp_container_container, containerContainer) {
+TEMPLATE_TEST_CASE("Container container",
+                   "[optiontype]",
+                   std::vector<icontainer1>,
+                   std::list<icontainer1>,
+                   std::set<icontainer1>,
+                   std::deque<icontainer1>,
+                   std::vector<icontainer2>,
+                   std::list<icontainer2>,
+                   std::set<icontainer2>,
+                   std::deque<icontainer2>,
+                   std::vector<icontainer3>,
+                   std::list<icontainer3>,
+                   std::set<icontainer3>,
+                   std::deque<icontainer3>) {
+    TApp tapp;
+    TestType cv;
 
-    auto &cv = TApp_container_container<TypeParam>::cval;
-    (TApp::app).add_option("--dict", cv);
+    (tapp.app).add_option("--dict", cv);
 
-    TApp::args = {"--dict", "1", "2", "4", "--dict", "3", "1"};
+    tapp.args = {"--dict", "1", "2", "4", "--dict", "3", "1"};
 
-    TApp::run();
-    EXPECT_EQ(cv.size(), 2u);
+    tapp.run();
+    CHECK(2u == cv.size());
 
-    TApp::args = {"--dict", "1", "2", "4", "--dict", "3", "1", "--dict", "3", "--dict",
-                  "3",      "3", "3", "3", "3",      "3", "3", "3",      "3", "-3"};
-    TApp::run();
-    EXPECT_EQ(cv.size(), 4u);
+    tapp.args = {"--dict", "1", "2", "4", "--dict", "3", "1", "--dict", "3", "--dict",
+                 "3",      "3", "3", "3", "3",      "3", "3", "3",      "3", "-3"};
+    tapp.run();
+    CHECK(4u == cv.size());
 }
 
-TEST_F(TApp, containerContainer) {
+TEST_CASE_METHOD(TApp, "containerContainer", "[optiontype]") {
 
     std::vector<icontainer4> cv;
     app.add_option("--dict", cv);
@@ -864,15 +842,15 @@ TEST_F(TApp, containerContainer) {
     args = {"--dict", "1", "2", "4", "--dict", "3", "1"};
 
     run();
-    EXPECT_EQ(cv.size(), 2u);
+    CHECK(2u == cv.size());
 
     args = {"--dict", "1", "2", "4", "--dict", "3", "1", "--dict", "3", "",  "--dict",
             "3",      "3", "3", "3", "3",      "3", "3", "3",      "3", "-3"};
     run();
-    EXPECT_EQ(cv.size(), 4u);
+    CHECK(4u == cv.size());
 }
 
-TEST_F(TApp, unknownContainerWrapper) {
+TEST_CASE_METHOD(TApp, "unknownContainerWrapper", "[optiontype]") {
 
     class vopt {
       public:
@@ -887,14 +865,14 @@ TEST_F(TApp, unknownContainerWrapper) {
     args = {"--vv", "1", "2", "4"};
 
     run();
-    EXPECT_EQ(cv.val_.size(), 3u);
+    CHECK(3u == cv.val_.size());
     args = {"--vv", ""};
 
     run();
-    EXPECT_TRUE(cv.val_.empty());
+    CHECK(cv.val_.empty());
 }
 
-TEST_F(TApp, tupleTwoVectors) {
+TEST_CASE_METHOD(TApp, "tupleTwoVectors", "[optiontype]") {
 
     std::tuple<std::vector<int>, std::vector<int>> cv;
     app.add_option("--vv", cv);
@@ -902,17 +880,17 @@ TEST_F(TApp, tupleTwoVectors) {
     args = {"--vv", "1", "2", "4"};
 
     run();
-    EXPECT_EQ(std::get<0>(cv).size(), 3U);
-    EXPECT_TRUE(std::get<1>(cv).empty());
+    CHECK(3U == std::get<0>(cv).size());
+    CHECK(std::get<1>(cv).empty());
 
     args = {"--vv", "1", "2", "%%", "4", "4", "5"};
 
     run();
-    EXPECT_EQ(std::get<0>(cv).size(), 2U);
-    EXPECT_EQ(std::get<1>(cv).size(), 3U);
+    CHECK(2U == std::get<0>(cv).size());
+    CHECK(3U == std::get<1>(cv).size());
 }
 
-TEST_F(TApp, vectorSingleArg) {
+TEST_CASE_METHOD(TApp, "vectorSingleArg", "[optiontype]") {
 
     std::vector<int> cv;
     app.add_option("-c", cv)->allow_extra_args(false);
@@ -921,11 +899,11 @@ TEST_F(TApp, vectorSingleArg) {
     args = {"-c", "1", "-c", "2", "4"};
 
     run();
-    EXPECT_EQ(cv.size(), 2U);
-    EXPECT_EQ(extra, "4");
+    CHECK(2U == cv.size());
+    CHECK("4" == extra);
 }
 
-TEST_F(TApp, vectorDoubleArg) {
+TEST_CASE_METHOD(TApp, "vectorDoubleArg", "[optiontype]") {
 
     std::vector<std::pair<int, std::string>> cv;
     app.add_option("-c", cv)->allow_extra_args(false);
@@ -934,6 +912,6 @@ TEST_F(TApp, vectorDoubleArg) {
     args = {"-c", "1", "bob", "-c", "2", "apple", "4", "key"};
 
     run();
-    EXPECT_EQ(cv.size(), 2U);
-    EXPECT_EQ(extras.size(), 2U);
+    CHECK(2U == cv.size());
+    CHECK(2U == extras.size());
 }
diff --git a/packages/CLI11/tests/OptionalTest.cpp b/packages/CLI11/tests/OptionalTest.cpp
index 51f159aa69975b0dff22ed06ef315f0be8797e76..4d8da020b67b1f3531cce758d0eaa0e238bec745 100644
--- a/packages/CLI11/tests/OptionalTest.cpp
+++ b/packages/CLI11/tests/OptionalTest.cpp
@@ -62,70 +62,70 @@
 #pragma warning(disable : 4244)
 #endif
 
-TEST_F(TApp, StdOptionalTest) {
+TEST_CASE_METHOD(TApp, "StdOptionalTest", "[optional]") {
     std::optional<int> opt;
     app.add_option("-c,--count", opt);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-c", "1"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 1);
+    CHECK(opt);
+    CHECK(1 == *opt);
 
     args = {"--count", "3"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 3);
+    CHECK(opt);
+    CHECK(3 == *opt);
 }
 
-TEST_F(TApp, StdOptionalVectorEmptyDirect) {
+TEST_CASE_METHOD(TApp, "StdOptionalVectorEmptyDirect", "[optional]") {
     std::optional<std::vector<int>> opt;
     app.add_option("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     // app.add_option("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-v"};
     opt = std::vector<int>{4, 3};
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-v", "1", "4", "5"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::vector<int> expV{1, 4, 5};
-    EXPECT_EQ(*opt, expV);
+    CHECK(expV == *opt);
 }
 
-TEST_F(TApp, StdOptionalComplexDirect) {
+TEST_CASE_METHOD(TApp, "StdOptionalComplexDirect", "[optional]") {
     std::optional<std::complex<double>> opt;
     app.add_option("-c,--complex", opt)->type_size(0, 2);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-c"};
     opt = std::complex<double>{4.0, 3.0};
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-c", "1+2j"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::complex<double> val{1, 2};
-    EXPECT_EQ(*opt, val);
+    CHECK(val == *opt);
     args = {"-c", "3", "-4"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::complex<double> val2{3, -4};
-    EXPECT_EQ(*opt, val2);
+    CHECK(val2 == *opt);
 }
 
-TEST_F(TApp, StdOptionalUint) {
+TEST_CASE_METHOD(TApp, "StdOptionalUint", "[optional]") {
     std::optional<std::uint64_t> opt;
     app.add_option("-i,--int", opt);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-i", "15"};
     run();
-    EXPECT_EQ(*opt, 15U);
+    CHECK(15U == *opt);
     static_assert(CLI::detail::classify_object<std::optional<std::uint64_t>>::value ==
                   CLI::detail::object_category::wrapper_value);
 }
@@ -137,97 +137,97 @@ TEST_F(TApp, StdOptionalUint) {
 #endif
 #if CLI11_EXPERIMENTAL_OPTIONAL
 
-TEST_F(TApp, ExperimentalOptionalTest) {
+TEST_CASE_METHOD(TApp, "ExperimentalOptionalTest", "[optional]") {
     std::experimental::optional<int> opt;
     app.add_option("-c,--count", opt);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-c", "1"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 1);
+    CHECK(opt);
+    CHECK(1 == *opt);
 
     args = {"--count", "3"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 3);
+    CHECK(opt);
+    CHECK(3 == *opt);
 }
 
 #endif
 #if CLI11_BOOST_OPTIONAL
 
-TEST_F(TApp, BoostOptionalTest) {
+TEST_CASE_METHOD(TApp, "BoostOptionalTest", "[optional]") {
     boost::optional<int> opt;
     app.add_option("-c,--count", opt);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-c", "1"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 1);
+    CHECK(opt);
+    CHECK(1 == *opt);
     opt = {};
     args = {"--count", "3"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 3);
+    CHECK(opt);
+    CHECK(3 == *opt);
 }
 
-TEST_F(TApp, BoostOptionalTestZarg) {
+TEST_CASE_METHOD(TApp, "BoostOptionalTestZarg", "[optional]") {
     boost::optional<int> opt;
     app.add_option("-c,--count", opt)->expected(0, 1);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-c", "1"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 1);
+    CHECK(opt);
+    CHECK(1 == *opt);
     opt = {};
     args = {"--count"};
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 }
 
-TEST_F(TApp, BoostOptionalint64Test) {
+TEST_CASE_METHOD(TApp, "BoostOptionalint64Test", "[optional]") {
     boost::optional<std::int64_t> opt;
     app.add_option("-c,--count", opt);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-c", "1"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 1);
+    CHECK(opt);
+    CHECK(1 == *opt);
     opt = {};
     args = {"--count", "3"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, 3);
+    CHECK(opt);
+    CHECK(3 == *opt);
 }
 
-TEST_F(TApp, BoostOptionalStringTest) {
+TEST_CASE_METHOD(TApp, "BoostOptionalStringTest", "[optional]") {
     boost::optional<std::string> opt;
     app.add_option("-s,--string", opt);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-s", "strval"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, "strval");
+    CHECK(opt);
+    CHECK("strval" == *opt);
     opt = {};
     args = {"--string", "strv"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_EQ(*opt, "strv");
+    CHECK(opt);
+    CHECK("strv" == *opt);
 }
 namespace boost {
 using CLI::enums::operator<<;
 }
 
-TEST_F(TApp, BoostOptionalEnumTest) {
+TEST_CASE_METHOD(TApp, "BoostOptionalEnumTest", "[optional]") {
 
     enum class eval : char { val0 = 0, val1 = 1, val2 = 2, val3 = 3, val4 = 4 };
     boost::optional<eval> opt, opt2;
@@ -237,93 +237,89 @@ TEST_F(TApp, BoostOptionalEnumTest) {
     optptr->capture_default_str();
 
     auto dstring = optptr->get_default_str();
-    EXPECT_TRUE(dstring.empty());
+    CHECK(dstring.empty());
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-v", "3"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_TRUE(*opt == eval::val3);
+    CHECK(opt);
+    CHECK(*opt == eval::val3);
     opt = {};
     args = {"--val", "1"};
     run();
-    EXPECT_TRUE(opt);
-    EXPECT_TRUE(*opt == eval::val1);
+    CHECK(opt);
+    CHECK(*opt == eval::val1);
 }
 
-TEST_F(TApp, BoostOptionalVector) {
+TEST_CASE_METHOD(TApp, "BoostOptionalVector", "[optional]") {
     boost::optional<std::vector<int>> opt;
     app.add_option_function<std::vector<int>>(
            "-v,--vec", [&opt](const std::vector<int> &v) { opt = v; }, "some vector")
         ->expected(3);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
 
     args = {"-v", "1", "4", "5"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::vector<int> expV{1, 4, 5};
-    EXPECT_EQ(*opt, expV);
+    CHECK(expV == *opt);
 }
 
-TEST_F(TApp, BoostOptionalVectorEmpty) {
+TEST_CASE_METHOD(TApp, "BoostOptionalVectorEmpty", "[optional]") {
     boost::optional<std::vector<int>> opt;
     app.add_option<decltype(opt), std::vector<int>>("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     // app.add_option("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-v"};
     opt = std::vector<int>{4, 3};
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-v", "1", "4", "5"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::vector<int> expV{1, 4, 5};
-    EXPECT_EQ(*opt, expV);
+    CHECK(expV == *opt);
 }
 
-TEST_F(TApp, BoostOptionalVectorEmptyDirect) {
+TEST_CASE_METHOD(TApp, "BoostOptionalVectorEmptyDirect", "[optional]") {
     boost::optional<std::vector<int>> opt;
     app.add_option_no_stream("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     // app.add_option("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-v"};
     opt = std::vector<int>{4, 3};
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-v", "1", "4", "5"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::vector<int> expV{1, 4, 5};
-    EXPECT_EQ(*opt, expV);
+    CHECK(expV == *opt);
 }
 
-TEST_F(TApp, BoostOptionalComplexDirect) {
+TEST_CASE_METHOD(TApp, "BoostOptionalComplexDirect", "[optional]") {
     boost::optional<std::complex<double>> opt;
     app.add_option("-c,--complex", opt)->type_size(0, 2);
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-c"};
     opt = std::complex<double>{4.0, 3.0};
     run();
-    EXPECT_FALSE(opt);
+    CHECK(!opt);
     args = {"-c", "1+2j"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::complex<double> val{1, 2};
-    EXPECT_EQ(*opt, val);
+    CHECK(val == *opt);
     args = {"-c", "3", "-4"};
     run();
-    EXPECT_TRUE(opt);
+    CHECK(opt);
     std::complex<double> val2{3, -4};
-    EXPECT_EQ(*opt, val2);
+    CHECK(val2 == *opt);
 }
 
 #endif
-
-#if !CLI11_OPTIONAL
-TEST_F(TApp, DISABLED_OptionalTest) {}
-#endif
diff --git a/packages/CLI11/tests/SetTest.cpp b/packages/CLI11/tests/SetTest.cpp
index 7b6236edcbb13dfe0eab73200e11fd223daf57bf..d34a67a64f2163a7e1704fbc99a1952188611e37 100644
--- a/packages/CLI11/tests/SetTest.cpp
+++ b/packages/CLI11/tests/SetTest.cpp
@@ -31,79 +31,79 @@ static_assert(CLI::detail::pair_adaptor<std::vector<std::string>>::value == fals
 static_assert(CLI::detail::pair_adaptor<std::map<int, int>>::value == true, "Should have pairs");
 static_assert(CLI::detail::pair_adaptor<std::vector<std::pair<int, int>>>::value == true, "Should have pairs");
 
-TEST_F(TApp, SimpleMaps) {
+TEST_CASE_METHOD(TApp, "SimpleMaps", "[set]") {
     int value{0};
     std::map<std::string, int> map = {{"one", 1}, {"two", 2}};
     auto opt = app.add_option("-s,--set", value)->transform(CLI::Transformer(map));
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
-TEST_F(TApp, StringStringMap) {
+TEST_CASE_METHOD(TApp, "StringStringMap", "[set]") {
     std::string value;
     std::map<std::string, std::string> map = {{"a", "b"}, {"b", "c"}};
     app.add_option("-s,--set", value)->transform(CLI::CheckedTransformer(map));
     args = {"-s", "a"};
     run();
-    EXPECT_EQ(value, "b");
+    CHECK("b" == value);
 
     args = {"-s", "b"};
     run();
-    EXPECT_EQ(value, "c");
+    CHECK("c" == value);
 
     args = {"-s", "c"};
-    EXPECT_EQ(value, "c");
+    CHECK("c" == value);
 }
 
-TEST_F(TApp, StringStringMapNoModify) {
+TEST_CASE_METHOD(TApp, "StringStringMapNoModify", "[set]") {
     std::string value;
     std::map<std::string, std::string> map = {{"a", "b"}, {"b", "c"}};
     app.add_option("-s,--set", value)->check(CLI::IsMember(map));
     args = {"-s", "a"};
     run();
-    EXPECT_EQ(value, "a");
+    CHECK("a" == value);
 
     args = {"-s", "b"};
     run();
-    EXPECT_EQ(value, "b");
+    CHECK("b" == value);
 
     args = {"-s", "c"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
 enum SimpleEnum { SE_one = 1, SE_two = 2 };
 
-TEST_F(TApp, EnumMap) {
+TEST_CASE_METHOD(TApp, "EnumMap", "[set]") {
     SimpleEnum value;
     std::map<std::string, SimpleEnum> map = {{"one", SE_one}, {"two", SE_two}};
     auto opt = app.add_option("-s,--set", value)->transform(CLI::Transformer(map));
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, SE_one);
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(SE_one == value);
 }
 
 enum class SimpleEnumC { one = 1, two = 2 };
 
-TEST_F(TApp, EnumCMap) {
+TEST_CASE_METHOD(TApp, "EnumCMap", "[set]") {
     SimpleEnumC value;
     std::map<std::string, SimpleEnumC> map = {{"one", SimpleEnumC::one}, {"two", SimpleEnumC::two}};
     auto opt = app.add_option("-s,--set", value)->transform(CLI::Transformer(map));
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, SimpleEnumC::one);
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(SimpleEnumC::one == value);
 }
 
-TEST_F(TApp, structMap) {
+TEST_CASE_METHOD(TApp, "structMap", "[set]") {
     struct tstruct {
         int val2;
         double val3;
@@ -114,16 +114,16 @@ TEST_F(TApp, structMap) {
     auto opt = app.add_option("-s,--set", struct_name)->check(CLI::IsMember(map));
     args = {"-s", "sone"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(struct_name, "sone");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("sone" == struct_name);
 
     args = {"-s", "sthree"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, structMapChange) {
+TEST_CASE_METHOD(TApp, "structMapChange", "[set]") {
     struct tstruct {
         int val2;
         double val3;
@@ -135,23 +135,23 @@ TEST_F(TApp, structMapChange) {
                    ->transform(CLI::IsMember(map, CLI::ignore_case, CLI::ignore_underscore, CLI::ignore_space));
     args = {"-s", "s one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(struct_name, "sone");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("sone" == struct_name);
 
     args = {"-s", "sthree"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-s", "S_t_w_o"};
     run();
-    EXPECT_EQ(struct_name, "stwo");
+    CHECK("stwo" == struct_name);
     args = {"-s", "S two"};
     run();
-    EXPECT_EQ(struct_name, "stwo");
+    CHECK("stwo" == struct_name);
 }
 
-TEST_F(TApp, structMapNoChange) {
+TEST_CASE_METHOD(TApp, "structMapNoChange", "[set]") {
     struct tstruct {
         int val2;
         double val3;
@@ -163,24 +163,24 @@ TEST_F(TApp, structMapNoChange) {
                    ->check(CLI::IsMember(map, CLI::ignore_case, CLI::ignore_underscore, CLI::ignore_space));
     args = {"-s", "SONE"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(struct_name, "SONE");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("SONE" == struct_name);
 
     args = {"-s", "sthree"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-s", "S_t_w_o"};
     run();
-    EXPECT_EQ(struct_name, "S_t_w_o");
+    CHECK("S_t_w_o" == struct_name);
 
     args = {"-s", "S two"};
     run();
-    EXPECT_EQ(struct_name, "S two");
+    CHECK("S two" == struct_name);
 }
 
-TEST_F(TApp, NonCopyableMap) {
+TEST_CASE_METHOD(TApp, "NonCopyableMap", "[set]") {
 
     std::string map_name;
     std::map<std::string, std::unique_ptr<double>> map;
@@ -189,16 +189,16 @@ TEST_F(TApp, NonCopyableMap) {
     auto opt = app.add_option("-s,--set", map_name)->check(CLI::IsMember(&map));
     args = {"-s", "e1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(map_name, "e1");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("e1" == map_name);
 
     args = {"-s", "e45"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, NonCopyableMapWithFunction) {
+TEST_CASE_METHOD(TApp, "NonCopyableMapWithFunction", "[set]") {
 
     std::string map_name;
     std::map<std::string, std::unique_ptr<double>> map;
@@ -207,16 +207,16 @@ TEST_F(TApp, NonCopyableMapWithFunction) {
     auto opt = app.add_option("-s,--set", map_name)->transform(CLI::IsMember(&map, CLI::ignore_underscore));
     args = {"-s", "e_1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(map_name, "e1");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("e1" == map_name);
 
     args = {"-s", "e45"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, NonCopyableMapNonStringMap) {
+TEST_CASE_METHOD(TApp, "NonCopyableMapNonStringMap", "[set]") {
 
     std::string map_name;
     std::map<int, std::unique_ptr<double>> map;
@@ -225,16 +225,16 @@ TEST_F(TApp, NonCopyableMapNonStringMap) {
     auto opt = app.add_option("-s,--set", map_name)->check(CLI::IsMember(&map));
     args = {"-s", "4"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(map_name, "4");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("4" == map_name);
 
     args = {"-s", "e45"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, CopyableMapMove) {
+TEST_CASE_METHOD(TApp, "CopyableMapMove", "[set]") {
 
     std::string map_name;
     std::map<int, double> map;
@@ -243,162 +243,162 @@ TEST_F(TApp, CopyableMapMove) {
     auto opt = app.add_option("-s,--set", map_name)->check(CLI::IsMember(std::move(map)));
     args = {"-s", "4"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(map_name, "4");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("4" == map_name);
 
     args = {"-s", "e45"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, SimpleSets) {
+TEST_CASE_METHOD(TApp, "SimpleSets", "[set]") {
     std::string value;
     auto opt = app.add_option("-s,--set", value)->check(CLI::IsMember{std::set<std::string>({"one", "two", "three"})});
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, "one");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("one" == value);
 }
 
-TEST_F(TApp, SimpleSetsPtrs) {
+TEST_CASE_METHOD(TApp, "SimpleSetsPtrs", "[set]") {
     auto set = std::shared_ptr<std::set<std::string>>(new std::set<std::string>{"one", "two", "three"});
     std::string value;
     auto opt = app.add_option("-s,--set", value)->check(CLI::IsMember{set});
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, "one");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("one" == value);
 
     set->insert("four");
 
     args = {"-s", "four"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, "four");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("four" == value);
 }
 
-TEST_F(TApp, SimiShortcutSets) {
+TEST_CASE_METHOD(TApp, "SimiShortcutSets", "[set]") {
     std::string value;
     auto opt = app.add_option("--set", value)->check(CLI::IsMember({"one", "two", "three"}));
     args = {"--set", "one"};
     run();
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, "one");
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("one" == value);
 
     std::string value2;
     auto opt2 = app.add_option("--set2", value2)->transform(CLI::IsMember({"One", "two", "three"}, CLI::ignore_case));
     args = {"--set2", "onE"};
     run();
-    EXPECT_EQ(1u, app.count("--set2"));
-    EXPECT_EQ(1u, opt2->count());
-    EXPECT_EQ(value2, "One");
+    CHECK(app.count("--set2") == 1u);
+    CHECK(opt2->count() == 1u);
+    CHECK("One" == value2);
 
     std::string value3;
     auto opt3 = app.add_option("--set3", value3)
                     ->transform(CLI::IsMember({"O_ne", "two", "three"}, CLI::ignore_case, CLI::ignore_underscore));
     args = {"--set3", "onE"};
     run();
-    EXPECT_EQ(1u, app.count("--set3"));
-    EXPECT_EQ(1u, opt3->count());
-    EXPECT_EQ(value3, "O_ne");
+    CHECK(app.count("--set3") == 1u);
+    CHECK(opt3->count() == 1u);
+    CHECK("O_ne" == value3);
 }
 
-TEST_F(TApp, SetFromCharStarArrayVector) {
+TEST_CASE_METHOD(TApp, "SetFromCharStarArrayVector", "[set]") {
     constexpr const char *names[3]{"one", "two", "three"};
     std::string value;
     auto opt = app.add_option("-s,--set", value)
                    ->check(CLI::IsMember{std::vector<std::string>(std::begin(names), std::end(names))});
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, "one");
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK("one" == value);
 }
 
-TEST_F(TApp, OtherTypeSets) {
+TEST_CASE_METHOD(TApp, "OtherTypeSets", "[set]") {
     int value{0};
     std::vector<int> set = {2, 3, 4};
     auto opt = app.add_option("--set", value)->check(CLI::IsMember(set));
     args = {"--set", "3"};
     run();
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 3);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(3 == value);
 
     args = {"--set", "5"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     std::vector<int> set2 = {-2, 3, 4};
     auto opt2 = app.add_option("--set2", value)->transform(CLI::IsMember(set2, [](int x) { return std::abs(x); }));
     args = {"--set2", "-3"};
     run();
-    EXPECT_EQ(1u, app.count("--set2"));
-    EXPECT_EQ(1u, opt2->count());
-    EXPECT_EQ(value, 3);
+    CHECK(app.count("--set2") == 1u);
+    CHECK(opt2->count() == 1u);
+    CHECK(3 == value);
 
     args = {"--set2", "-3"};
     run();
-    EXPECT_EQ(1u, app.count("--set2"));
-    EXPECT_EQ(1u, opt2->count());
-    EXPECT_EQ(value, 3);
+    CHECK(app.count("--set2") == 1u);
+    CHECK(opt2->count() == 1u);
+    CHECK(3 == value);
 
     args = {"--set2", "2"};
     run();
-    EXPECT_EQ(1u, app.count("--set2"));
-    EXPECT_EQ(1u, opt2->count());
-    EXPECT_EQ(value, -2);
+    CHECK(app.count("--set2") == 1u);
+    CHECK(opt2->count() == 1u);
+    CHECK(-2 == value);
 }
 
-TEST_F(TApp, NumericalSets) {
+TEST_CASE_METHOD(TApp, "NumericalSets", "[set]") {
     int value{0};
     auto opt = app.add_option("-s,--set", value)->check(CLI::IsMember{std::set<int>({1, 2, 3})});
     args = {"-s", "1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, app.count("--set"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(app.count("--set") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
 // Converted original set tests
 
-TEST_F(TApp, SetWithDefaults) {
+TEST_CASE_METHOD(TApp, "SetWithDefaults", "[set]") {
     int someint{2};
     app.add_option("-a", someint, "", true)->check(CLI::IsMember({1, 2, 3, 4}));
 
     args = {"-a1", "-a2"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, SetWithDefaultsConversion) {
+TEST_CASE_METHOD(TApp, "SetWithDefaultsConversion", "[set]") {
     int someint{2};
     app.add_option("-a", someint, "", true)->check(CLI::IsMember({1, 2, 3, 4}));
 
     args = {"-a", "hi"};
 
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, SetWithDefaultsIC) {
+TEST_CASE_METHOD(TApp, "SetWithDefaultsIC", "[set]") {
     std::string someint = "ho";
     app.add_option("-a", someint, "", true)->check(CLI::IsMember({"Hi", "Ho"}));
 
     args = {"-aHi", "-aHo"};
 
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, InSet) {
+TEST_CASE_METHOD(TApp, "InSet", "[set]") {
 
     std::string choice;
     app.add_option("-q,--quick", choice)->check(CLI::IsMember({"one", "two", "three"}));
@@ -406,47 +406,47 @@ TEST_F(TApp, InSet) {
     args = {"--quick", "two"};
 
     run();
-    EXPECT_EQ("two", choice);
+    CHECK(choice == "two");
 
     args = {"--quick", "four"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, InSetWithDefault) {
+TEST_CASE_METHOD(TApp, "InSetWithDefault", "[set]") {
 
     std::string choice = "one";
     app.add_option("-q,--quick", choice, "", true)->check(CLI::IsMember({"one", "two", "three"}));
 
     run();
-    EXPECT_EQ("one", choice);
+    CHECK(choice == "one");
 
     args = {"--quick", "two"};
 
     run();
-    EXPECT_EQ("two", choice);
+    CHECK(choice == "two");
 
     args = {"--quick", "four"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, InCaselessSetWithDefault) {
+TEST_CASE_METHOD(TApp, "InCaselessSetWithDefault", "[set]") {
 
     std::string choice = "one";
     app.add_option("-q,--quick", choice, "", true)->transform(CLI::IsMember({"one", "two", "three"}, CLI::ignore_case));
 
     run();
-    EXPECT_EQ("one", choice);
+    CHECK(choice == "one");
 
     args = {"--quick", "tWo"};
 
     run();
-    EXPECT_EQ("two", choice);
+    CHECK(choice == "two");
 
     args = {"--quick", "four"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, InIntSet) {
+TEST_CASE_METHOD(TApp, "InIntSet", "[set]") {
 
     int choice{0};
     app.add_option("-q,--quick", choice)->check(CLI::IsMember({1, 2, 3}));
@@ -454,13 +454,13 @@ TEST_F(TApp, InIntSet) {
     args = {"--quick", "2"};
 
     run();
-    EXPECT_EQ(2, choice);
+    CHECK(choice == 2);
 
     args = {"--quick", "4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, InIntSetWindows) {
+TEST_CASE_METHOD(TApp, "InIntSetWindows", "[set]") {
 
     int choice{0};
     app.add_option("-q,--quick", choice)->check(CLI::IsMember({1, 2, 3}));
@@ -468,28 +468,28 @@ TEST_F(TApp, InIntSetWindows) {
     args = {"/q", "2"};
 
     run();
-    EXPECT_EQ(2, choice);
+    CHECK(choice == 2);
 
     args = {"/q", "4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"/q4"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, FailSet) {
+TEST_CASE_METHOD(TApp, "FailSet", "[set]") {
 
     int choice{0};
     app.add_option("-q,--quick", choice)->check(CLI::IsMember({1, 2, 3}));
 
     args = {"--quick", "3", "--quick=2"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 
     args = {"--quick=hello"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, FailMutableSet) {
+TEST_CASE_METHOD(TApp, "FailMutableSet", "[set]") {
 
     int choice{0};
     auto vals = std::shared_ptr<std::set<int>>(new std::set<int>({1, 2, 3}));
@@ -497,37 +497,37 @@ TEST_F(TApp, FailMutableSet) {
     app.add_option("-s,--slow", choice, "", true)->check(CLI::IsMember(vals));
 
     args = {"--quick=hello"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--slow=hello"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, InSetIgnoreCase) {
+TEST_CASE_METHOD(TApp, "InSetIgnoreCase", "[set]") {
 
     std::string choice;
     app.add_option("-q,--quick", choice)->transform(CLI::IsMember({"one", "Two", "THREE"}, CLI::ignore_case));
 
     args = {"--quick", "One"};
     run();
-    EXPECT_EQ("one", choice);
+    CHECK(choice == "one");
 
     args = {"--quick", "two"};
     run();
-    EXPECT_EQ("Two", choice);  // Keeps caps from set
+    CHECK(choice == "Two");
 
     args = {"--quick", "ThrEE"};
     run();
-    EXPECT_EQ("THREE", choice);  // Keeps caps from set
+    CHECK(choice == "THREE");
 
     args = {"--quick", "four"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--quick=one", "--quick=two"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, InSetIgnoreCaseMutableValue) {
+TEST_CASE_METHOD(TApp, "InSetIgnoreCaseMutableValue", "[set]") {
 
     std::set<std::string> options{"one", "Two", "THREE"};
     std::string choice;
@@ -535,22 +535,22 @@ TEST_F(TApp, InSetIgnoreCaseMutableValue) {
 
     args = {"--quick", "One"};
     run();
-    EXPECT_EQ("one", choice);
+    CHECK(choice == "one");
 
     args = {"--quick", "two"};
     run();
-    EXPECT_EQ("Two", choice);  // Keeps caps from set
+    CHECK(choice == "Two");
 
     args = {"--quick", "ThrEE"};
     run();
-    EXPECT_EQ("THREE", choice);  // Keeps caps from set
+    CHECK(choice == "THREE");
 
     options.clear();
     args = {"--quick", "ThrEE"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, InSetIgnoreCasePointer) {
+TEST_CASE_METHOD(TApp, "InSetIgnoreCasePointer", "[set]") {
 
     std::set<std::string> *options = new std::set<std::string>{"one", "Two", "THREE"};
     std::string choice;
@@ -558,43 +558,43 @@ TEST_F(TApp, InSetIgnoreCasePointer) {
 
     args = {"--quick", "One"};
     run();
-    EXPECT_EQ("one", choice);
+    CHECK(choice == "one");
 
     args = {"--quick", "two"};
     run();
-    EXPECT_EQ("Two", choice);  // Keeps caps from set
+    CHECK(choice == "Two");
 
     args = {"--quick", "ThrEE"};
     run();
-    EXPECT_EQ("THREE", choice);  // Keeps caps from set
+    CHECK(choice == "THREE");
 
     delete options;
     args = {"--quick", "ThrEE"};
     run();
-    EXPECT_EQ("THREE", choice);  // this does not throw a segfault
+    CHECK(choice == "THREE");
 
     args = {"--quick", "four"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--quick=one", "--quick=two"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, NotInSetIgnoreCasePointer) {
+TEST_CASE_METHOD(TApp, "NotInSetIgnoreCasePointer", "[set]") {
 
     std::set<std::string> *options = new std::set<std::string>{"one", "Two", "THREE"};
     std::string choice;
     app.add_option("-q,--quick", choice)->check(!CLI::IsMember(*options, CLI::ignore_case));
 
     args = {"--quick", "One"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--quick", "four"};
     run();
-    EXPECT_EQ(choice, "four");
+    CHECK("four" == choice);
 }
 
-TEST_F(TApp, InSetIgnoreUnderscore) {
+TEST_CASE_METHOD(TApp, "InSetIgnoreUnderscore", "[set]") {
 
     std::string choice;
     app.add_option("-q,--quick", choice)
@@ -602,24 +602,24 @@ TEST_F(TApp, InSetIgnoreUnderscore) {
 
     args = {"--quick", "option_one"};
     run();
-    EXPECT_EQ("option_one", choice);
+    CHECK(choice == "option_one");
 
     args = {"--quick", "optiontwo"};
     run();
-    EXPECT_EQ("option_two", choice);  // Keeps underscore from set
+    CHECK(choice == "option_two");
 
     args = {"--quick", "_option_thr_ee"};
     run();
-    EXPECT_EQ("optionthree", choice);  // no underscore
+    CHECK(choice == "optionthree");
 
     args = {"--quick", "Option4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--quick=option_one", "--quick=option_two"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
-TEST_F(TApp, InSetIgnoreCaseUnderscore) {
+TEST_CASE_METHOD(TApp, "InSetIgnoreCaseUnderscore", "[set]") {
 
     std::string choice;
     app.add_option("-q,--quick", choice)
@@ -628,25 +628,25 @@ TEST_F(TApp, InSetIgnoreCaseUnderscore) {
 
     args = {"--quick", "option_one"};
     run();
-    EXPECT_EQ("Option_One", choice);
+    CHECK(choice == "Option_One");
 
     args = {"--quick", "OptionTwo"};
     run();
-    EXPECT_EQ("option_two", choice);  // Keeps underscore and case from set
+    CHECK(choice == "option_two");
 
     args = {"--quick", "_OPTION_thr_ee"};
     run();
-    EXPECT_EQ("OptionThree", choice);  // no underscore
+    CHECK(choice == "OptionThree");
 
     args = {"--quick", "Option4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--quick=option_one", "--quick=option_two"};
-    EXPECT_THROW(run(), CLI::ArgumentMismatch);
+    CHECK_THROWS_AS(run(), CLI::ArgumentMismatch);
 }
 
 // #113
-TEST_F(TApp, AddRemoveSetItems) {
+TEST_CASE_METHOD(TApp, "AddRemoveSetItems", "[set]") {
     std::set<std::string> items{"TYPE1", "TYPE2", "TYPE3", "TYPE4", "TYPE5"};
 
     std::string type1, type2;
@@ -656,8 +656,8 @@ TEST_F(TApp, AddRemoveSetItems) {
     args = {"--type1", "TYPE1", "--type2", "TYPE2"};
 
     run();
-    EXPECT_EQ(type1, "TYPE1");
-    EXPECT_EQ(type2, "TYPE2");
+    CHECK("TYPE1" == type1);
+    CHECK("TYPE2" == type2);
 
     items.insert("TYPE6");
     items.insert("TYPE7");
@@ -667,17 +667,17 @@ TEST_F(TApp, AddRemoveSetItems) {
 
     args = {"--type1", "TYPE6", "--type2", "TYPE7"};
     run();
-    EXPECT_EQ(type1, "TYPE6");
-    EXPECT_EQ(type2, "TYPE7");
+    CHECK("TYPE6" == type1);
+    CHECK("TYPE7" == type2);
 
     args = {"--type1", "TYPE1"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--type2", "TYPE2"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, AddRemoveSetItemsNoCase) {
+TEST_CASE_METHOD(TApp, "AddRemoveSetItemsNoCase", "[set]") {
     std::set<std::string> items{"TYPE1", "TYPE2", "TYPE3", "TYPE4", "TYPE5"};
 
     std::string type1, type2;
@@ -687,8 +687,8 @@ TEST_F(TApp, AddRemoveSetItemsNoCase) {
     args = {"--type1", "TYPe1", "--type2", "TyPE2"};
 
     run();
-    EXPECT_EQ(type1, "TYPE1");
-    EXPECT_EQ(type2, "TYPE2");
+    CHECK("TYPE1" == type1);
+    CHECK("TYPE2" == type2);
 
     items.insert("TYPE6");
     items.insert("TYPE7");
@@ -698,12 +698,12 @@ TEST_F(TApp, AddRemoveSetItemsNoCase) {
 
     args = {"--type1", "TyPE6", "--type2", "tYPE7"};
     run();
-    EXPECT_EQ(type1, "TYPE6");
-    EXPECT_EQ(type2, "TYPE7");
+    CHECK("TYPE6" == type1);
+    CHECK("TYPE7" == type2);
 
     args = {"--type1", "TYPe1"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"--type2", "TYpE2"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
diff --git a/packages/CLI11/tests/SimpleTest.cpp b/packages/CLI11/tests/SimpleTest.cpp
index dfcd5579127bf363a04893de6826fbb36ed345bd..92262b5e64ef7f30a65fe50d7a38592d33ac1809 100644
--- a/packages/CLI11/tests/SimpleTest.cpp
+++ b/packages/CLI11/tests/SimpleTest.cpp
@@ -10,11 +10,11 @@
 #include "CLI/CLI.hpp"
 #endif
 
-#include "gtest/gtest.h"
+#include "catch.hpp"
 
 using input_t = std::vector<std::string>;
 
-TEST(Basic, Empty) {
+TEST_CASE("Basic: Empty", "[simple]") {
 
     {
         CLI::App app;
@@ -24,7 +24,7 @@ TEST(Basic, Empty) {
     {
         CLI::App app;
         input_t spare = {"spare"};
-        EXPECT_THROW(app.parse(spare), CLI::ExtrasError);
+        CHECK_THROWS_AS(app.parse(spare), CLI::ExtrasError);
     }
     {
         CLI::App app;
diff --git a/packages/CLI11/tests/StringParseTest.cpp b/packages/CLI11/tests/StringParseTest.cpp
index 568bb4ecc45105db27fd51df415c70c379016cac..3a9f7371d87bd0412554eb49d2fe559ea060c95c 100644
--- a/packages/CLI11/tests/StringParseTest.cpp
+++ b/packages/CLI11/tests/StringParseTest.cpp
@@ -6,11 +6,10 @@
 
 #include "app_helper.hpp"
 
-#include "gmock/gmock.h"
 #include <cstdio>
 #include <sstream>
 
-TEST_F(TApp, ExistingExeCheck) {
+TEST_CASE_METHOD(TApp, "ExistingExeCheck", "[stringparse]") {
 
     TempFile tmpexe{"existingExe.out"};
 
@@ -27,12 +26,12 @@ TEST_F(TApp, ExistingExeCheck) {
     app.parse(std::string("./") + std::string(tmpexe) +
                   " --string=\"this is my quoted string\" -t 'qstring 2' -m=`\"quoted string\"`",
               true);
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("this is my quoted string" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 }
 
-TEST_F(TApp, ExistingExeCheckWithSpace) {
+TEST_CASE_METHOD(TApp, "ExistingExeCheckWithSpace", "[stringparse]") {
 
     TempFile tmpexe{"Space File.out"};
 
@@ -49,14 +48,14 @@ TEST_F(TApp, ExistingExeCheckWithSpace) {
     app.parse(std::string("./") + std::string(tmpexe) +
                   " --string=\"this is my quoted string\" -t 'qstring 2' -m=`\"quoted string\"`",
               true);
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("this is my quoted string" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 
-    EXPECT_EQ(app.get_name(), std::string("./") + std::string(tmpexe));
+    CHECK(std::string("./") + std::string(tmpexe) == app.get_name());
 }
 
-TEST_F(TApp, ExistingExeCheckWithLotsOfSpace) {
+TEST_CASE_METHOD(TApp, "ExistingExeCheckWithLotsOfSpace", "[stringparse]") {
 
     TempFile tmpexe{"this is a weird file.exe"};
 
@@ -73,9 +72,9 @@ TEST_F(TApp, ExistingExeCheckWithLotsOfSpace) {
     app.parse(std::string("./") + std::string(tmpexe) +
                   " --string=\"this is my quoted string\" -t 'qstring 2' -m=`\"quoted string\"`",
               true);
-    EXPECT_EQ(str, "this is my quoted string");
-    EXPECT_EQ(str2, "qstring 2");
-    EXPECT_EQ(str3, "\"quoted string\"");
+    CHECK("this is my quoted string" == str);
+    CHECK("qstring 2" == str2);
+    CHECK("\"quoted string\"" == str3);
 
-    EXPECT_EQ(app.get_name(), std::string("./") + std::string(tmpexe));
+    CHECK(std::string("./") + std::string(tmpexe) == app.get_name());
 }
diff --git a/packages/CLI11/tests/SubcommandTest.cpp b/packages/CLI11/tests/SubcommandTest.cpp
index e6f961345ed1f784f9bb0a02a02899b8a9c4ec1d..eef67bca7d44994678b3534dd8c3d155171a7479 100644
--- a/packages/CLI11/tests/SubcommandTest.cpp
+++ b/packages/CLI11/tests/SubcommandTest.cpp
@@ -6,59 +6,55 @@
 
 #include "app_helper.hpp"
 
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-using ::testing::HasSubstr;
-using ::testing::Not;
+using Catch::Matchers::Contains;
 
 using vs_t = std::vector<std::string>;
 
-TEST_F(TApp, BasicSubcommands) {
+TEST_CASE_METHOD(TApp, "BasicSubcommands", "[subcom]") {
     auto sub1 = app.add_subcommand("sub1");
     auto sub2 = app.add_subcommand("sub2");
 
-    EXPECT_EQ(sub1->get_parent(), &app);
+    CHECK(&app == sub1->get_parent());
 
-    EXPECT_EQ(sub1, app.get_subcommand(sub1));
-    EXPECT_EQ(sub1, app.get_subcommand("sub1"));
-    EXPECT_THROW(app.get_subcommand("sub3"), CLI::OptionNotFound);
+    CHECK(app.get_subcommand(sub1) == sub1);
+    CHECK(app.get_subcommand("sub1") == sub1);
+    CHECK_THROWS_AS(app.get_subcommand("sub3"), CLI::OptionNotFound);
 
     run();
-    EXPECT_EQ(0u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().size() == 0u);
 
     args = {"sub1"};
     run();
-    EXPECT_EQ(sub1, app.get_subcommands().at(0));
-    EXPECT_EQ(1u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().at(0) == sub1);
+    CHECK(app.get_subcommands().size() == 1u);
 
     app.clear();
-    EXPECT_EQ(0u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().size() == 0u);
 
     args = {"sub2"};
     run();
-    EXPECT_EQ(1u, app.get_subcommands().size());
-    EXPECT_EQ(sub2, app.get_subcommands().at(0));
+    CHECK(app.get_subcommands().size() == 1u);
+    CHECK(app.get_subcommands().at(0) == sub2);
 
     args = {"SUb2"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"SUb2"};
     try {
         run();
     } catch(const CLI::ExtrasError &e) {
-        EXPECT_THAT(e.what(), HasSubstr("SUb2"));
+        CHECK_THAT(e.what(), Contains("SUb2"));
     }
 
     args = {"sub1", "extra"};
     try {
         run();
     } catch(const CLI::ExtrasError &e) {
-        EXPECT_THAT(e.what(), HasSubstr("extra"));
+        CHECK_THAT(e.what(), Contains("extra"));
     }
 }
 
-TEST_F(TApp, MultiSubFallthrough) {
+TEST_CASE_METHOD(TApp, "MultiSubFallthrough", "[subcom]") {
 
     // No explicit fallthrough
     auto sub1 = app.add_subcommand("sub1");
@@ -66,15 +62,15 @@ TEST_F(TApp, MultiSubFallthrough) {
 
     args = {"sub1", "sub2"};
     run();
-    EXPECT_TRUE(app.got_subcommand("sub1"));
-    EXPECT_TRUE(app.got_subcommand(sub1));
-    EXPECT_TRUE(*sub1);
-    EXPECT_TRUE(sub1->parsed());
-    EXPECT_EQ(sub1->count(), 1u);
+    CHECK(app.got_subcommand("sub1"));
+    CHECK(app.got_subcommand(sub1));
+    CHECK(*sub1);
+    CHECK(sub1->parsed());
+    CHECK(1u == sub1->count());
 
-    EXPECT_TRUE(app.got_subcommand("sub2"));
-    EXPECT_TRUE(app.got_subcommand(sub2));
-    EXPECT_TRUE(*sub2);
+    CHECK(app.got_subcommand("sub2"));
+    CHECK(app.got_subcommand(sub2));
+    CHECK(*sub2);
 
     app.require_subcommand();
     run();
@@ -83,34 +79,34 @@ TEST_F(TApp, MultiSubFallthrough) {
     run();
 
     app.require_subcommand(1);
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"sub1"};
     run();
 
-    EXPECT_TRUE(app.got_subcommand("sub1"));
-    EXPECT_FALSE(app.got_subcommand("sub2"));
+    CHECK(app.got_subcommand("sub1"));
+    CHECK(!app.got_subcommand("sub2"));
 
-    EXPECT_TRUE(*sub1);
-    EXPECT_FALSE(*sub2);
-    EXPECT_FALSE(sub2->parsed());
-    EXPECT_EQ(sub2->count(), 0u);
+    CHECK(*sub1);
+    CHECK(!*sub2);
+    CHECK(!sub2->parsed());
+    CHECK(0u == sub2->count());
 
-    EXPECT_THROW(app.got_subcommand("sub3"), CLI::OptionNotFound);
+    CHECK_THROWS_AS(app.got_subcommand("sub3"), CLI::OptionNotFound);
 }
 
-TEST_F(TApp, CrazyNameSubcommand) {
+TEST_CASE_METHOD(TApp, "CrazyNameSubcommand", "[subcom]") {
     auto sub1 = app.add_subcommand("sub1");
     // name can be set to whatever
-    EXPECT_NO_THROW(sub1->name("crazy name with spaces"));
+    CHECK_NOTHROW(sub1->name("crazy name with spaces"));
     args = {"crazy name with spaces"};
     run();
 
-    EXPECT_TRUE(app.got_subcommand("crazy name with spaces"));
-    EXPECT_EQ(sub1->count(), 1u);
+    CHECK(app.got_subcommand("crazy name with spaces"));
+    CHECK(1u == sub1->count());
 }
 
-TEST_F(TApp, RequiredAndSubcommands) {  // #23
+TEST_CASE_METHOD(TApp, "RequiredAndSubcommands", "[subcom]") {
 
     std::string baz;
     app.add_option("baz", baz, "Baz Description", true)->required();
@@ -118,26 +114,26 @@ TEST_F(TApp, RequiredAndSubcommands) {  // #23
     auto bar = app.add_subcommand("bar");
 
     args = {"bar", "foo"};
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(*foo);
-    EXPECT_FALSE(*bar);
-    EXPECT_EQ(baz, "bar");
+    REQUIRE_NOTHROW(run());
+    CHECK(*foo);
+    CHECK(!*bar);
+    CHECK("bar" == baz);
 
     args = {"foo"};
-    ASSERT_NO_THROW(run());
-    EXPECT_FALSE(*foo);
-    EXPECT_EQ(baz, "foo");
+    REQUIRE_NOTHROW(run());
+    CHECK(!*foo);
+    CHECK("foo" == baz);
 
     args = {"foo", "foo"};
-    ASSERT_NO_THROW(run());
-    EXPECT_TRUE(*foo);
-    EXPECT_EQ(baz, "foo");
+    REQUIRE_NOTHROW(run());
+    CHECK(*foo);
+    CHECK("foo" == baz);
 
     args = {"foo", "other"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, RequiredAndSubcomFallthrough) {
+TEST_CASE_METHOD(TApp, "RequiredAndSubcomFallthrough", "[subcom]") {
 
     std::string baz;
     app.add_option("baz", baz)->required();
@@ -147,14 +143,14 @@ TEST_F(TApp, RequiredAndSubcomFallthrough) {
 
     args = {"other", "bar"};
     run();
-    EXPECT_TRUE(bar);
-    EXPECT_EQ(baz, "other");
+    CHECK(bar);
+    CHECK("other" == baz);
 
     args = {"bar", "other2"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, FooFooProblem) {
+TEST_CASE_METHOD(TApp, "FooFooProblem", "[subcom]") {
 
     std::string baz_str, other_str;
     auto baz = app.add_option("baz", baz_str);
@@ -163,55 +159,55 @@ TEST_F(TApp, FooFooProblem) {
 
     args = {"foo", "foo"};
     run();
-    EXPECT_TRUE(*foo);
-    EXPECT_FALSE(*baz);
-    EXPECT_TRUE(*other);
-    EXPECT_EQ(baz_str, "");
-    EXPECT_EQ(other_str, "foo");
+    CHECK(*foo);
+    CHECK(!*baz);
+    CHECK(*other);
+    CHECK("" == baz_str);
+    CHECK("foo" == other_str);
 
     baz_str = "";
     other_str = "";
     baz->required();
     run();
-    EXPECT_TRUE(*foo);
-    EXPECT_TRUE(*baz);
-    EXPECT_FALSE(*other);
-    EXPECT_EQ(baz_str, "foo");
-    EXPECT_EQ(other_str, "");
+    CHECK(*foo);
+    CHECK(*baz);
+    CHECK(!*other);
+    CHECK("foo" == baz_str);
+    CHECK("" == other_str);
 }
 
-TEST_F(TApp, DuplicateSubcommands) {
+TEST_CASE_METHOD(TApp, "DuplicateSubcommands", "[subcom]") {
 
     auto foo = app.add_subcommand("foo");
 
     args = {"foo", "foo"};
     run();
-    EXPECT_TRUE(*foo);
-    EXPECT_EQ(foo->count(), 2u);
+    CHECK(*foo);
+    CHECK(2u == foo->count());
 
     args = {"foo", "foo", "foo"};
     run();
-    EXPECT_TRUE(*foo);
-    EXPECT_EQ(foo->count(), 3u);
+    CHECK(*foo);
+    CHECK(3u == foo->count());
 }
 
-TEST_F(TApp, DuplicateSubcommandCallbacks) {
+TEST_CASE_METHOD(TApp, "DuplicateSubcommandCallbacks", "[subcom]") {
 
     auto foo = app.add_subcommand("foo");
     int count{0};
     foo->callback([&count]() { ++count; });
     foo->immediate_callback();
-    EXPECT_TRUE(foo->get_immediate_callback());
+    CHECK(foo->get_immediate_callback());
     args = {"foo", "foo"};
     run();
-    EXPECT_EQ(count, 2);
+    CHECK(2 == count);
     count = 0;
     args = {"foo", "foo", "foo"};
     run();
-    EXPECT_EQ(count, 3);
+    CHECK(3 == count);
 }
 
-TEST_F(TApp, DuplicateSubcommandCallbacksValues) {
+TEST_CASE_METHOD(TApp, "DuplicateSubcommandCallbacksValues", "[subcom]") {
 
     auto foo = app.add_subcommand("foo");
     int val{0};
@@ -221,19 +217,19 @@ TEST_F(TApp, DuplicateSubcommandCallbacksValues) {
     foo->immediate_callback();
     args = {"foo", "--val=45", "foo", "--val=27"};
     run();
-    EXPECT_EQ(vals.size(), 2u);
-    EXPECT_EQ(vals[0], 45);
-    EXPECT_EQ(vals[1], 27);
+    CHECK(2u == vals.size());
+    CHECK(45 == vals[0]);
+    CHECK(27 == vals[1]);
     vals.clear();
     args = {"foo", "--val=45", "foo", "--val=27", "foo", "--val=36"};
     run();
-    EXPECT_EQ(vals.size(), 3u);
-    EXPECT_EQ(vals[0], 45);
-    EXPECT_EQ(vals[1], 27);
-    EXPECT_EQ(vals[2], 36);
+    CHECK(3u == vals.size());
+    CHECK(45 == vals[0]);
+    CHECK(27 == vals[1]);
+    CHECK(36 == vals[2]);
 }
 
-TEST_F(TApp, Callbacks) {
+TEST_CASE_METHOD(TApp, "Callbacks", "[subcom]") {
     auto sub1 = app.add_subcommand("sub1");
     sub1->callback([]() { throw CLI::Success(); });
     auto sub2 = app.add_subcommand("sub2");
@@ -241,12 +237,12 @@ TEST_F(TApp, Callbacks) {
     sub2->callback([&val]() { val = true; });
 
     args = {"sub2"};
-    EXPECT_FALSE(val);
+    CHECK(!val);
     run();
-    EXPECT_TRUE(val);
+    CHECK(val);
 }
 
-TEST_F(TApp, CallbackOrder) {
+TEST_CASE_METHOD(TApp, "CallbackOrder", "[subcom]") {
 
     std::vector<std::string> cb;
     app.parse_complete_callback([&cb]() { cb.push_back("ac1"); });
@@ -281,18 +277,18 @@ TEST_F(TApp, CallbackOrder) {
             "--sub2opt2",
             "val"};
     run();
-    EXPECT_EQ(cb.size(), 8u);
-    EXPECT_EQ(cb[0], "pa-13");
-    EXPECT_EQ(cb[1], "pc1-10");
-    EXPECT_EQ(cb[2], "c1");
-    EXPECT_EQ(cb[3], "pc2-6");
-    EXPECT_EQ(cb[4], "c1");
-    EXPECT_EQ(cb[5], "ac1");
-    EXPECT_EQ(cb[6], "c2");
-    EXPECT_EQ(cb[7], "ac2");
+    CHECK(8u == cb.size());
+    CHECK("pa-13" == cb[0]);
+    CHECK("pc1-10" == cb[1]);
+    CHECK("c1" == cb[2]);
+    CHECK("pc2-6" == cb[3]);
+    CHECK("c1" == cb[4]);
+    CHECK("ac1" == cb[5]);
+    CHECK("c2" == cb[6]);
+    CHECK("ac2" == cb[7]);
 }
 
-TEST_F(TApp, CallbackOrder2) {
+TEST_CASE_METHOD(TApp, "CallbackOrder2", "[subcom]") {
 
     std::vector<std::string> cb;
     app.add_subcommand("sub1")->parse_complete_callback([&cb]() { cb.push_back("sub1"); });
@@ -301,17 +297,17 @@ TEST_F(TApp, CallbackOrder2) {
 
     args = {"sub1", "sub2", "sub3", "sub1", "sub1", "sub2", "sub1"};
     run();
-    EXPECT_EQ(cb.size(), 7u);
-    EXPECT_EQ(cb[0], "sub1");
-    EXPECT_EQ(cb[1], "sub2");
-    EXPECT_EQ(cb[2], "sub3");
-    EXPECT_EQ(cb[3], "sub1");
-    EXPECT_EQ(cb[4], "sub1");
-    EXPECT_EQ(cb[5], "sub2");
-    EXPECT_EQ(cb[6], "sub1");
+    CHECK(7u == cb.size());
+    CHECK("sub1" == cb[0]);
+    CHECK("sub2" == cb[1]);
+    CHECK("sub3" == cb[2]);
+    CHECK("sub1" == cb[3]);
+    CHECK("sub1" == cb[4]);
+    CHECK("sub2" == cb[5]);
+    CHECK("sub1" == cb[6]);
 }
 
-TEST_F(TApp, CallbackOrder2_withFallthrough) {
+TEST_CASE_METHOD(TApp, "CallbackOrder2_withFallthrough", "[subcom]") {
 
     std::vector<std::string> cb;
 
@@ -321,64 +317,64 @@ TEST_F(TApp, CallbackOrder2_withFallthrough) {
 
     args = {"sub1", "sub2", "sub3", "sub1", "sub1", "sub2", "sub1"};
     run();
-    EXPECT_EQ(cb.size(), 7u);
-    EXPECT_EQ(cb[0], "sub1");
-    EXPECT_EQ(cb[1], "sub2");
-    EXPECT_EQ(cb[2], "sub3");
-    EXPECT_EQ(cb[3], "sub1");
-    EXPECT_EQ(cb[4], "sub1");
-    EXPECT_EQ(cb[5], "sub2");
-    EXPECT_EQ(cb[6], "sub1");
+    CHECK(7u == cb.size());
+    CHECK("sub1" == cb[0]);
+    CHECK("sub2" == cb[1]);
+    CHECK("sub3" == cb[2]);
+    CHECK("sub1" == cb[3]);
+    CHECK("sub1" == cb[4]);
+    CHECK("sub2" == cb[5]);
+    CHECK("sub1" == cb[6]);
 }
 
-TEST_F(TApp, RuntimeErrorInCallback) {
+TEST_CASE_METHOD(TApp, "RuntimeErrorInCallback", "[subcom]") {
     auto sub1 = app.add_subcommand("sub1");
     sub1->callback([]() { throw CLI::RuntimeError(); });
     auto sub2 = app.add_subcommand("sub2");
     sub2->callback([]() { throw CLI::RuntimeError(2); });
 
     args = {"sub1"};
-    EXPECT_THROW(run(), CLI::RuntimeError);
+    CHECK_THROWS_AS(run(), CLI::RuntimeError);
 
     args = {"sub1"};
     try {
         run();
     } catch(const CLI::RuntimeError &e) {
-        EXPECT_EQ(1, e.get_exit_code());
+        CHECK(e.get_exit_code() == 1);
     }
 
     args = {"sub2"};
-    EXPECT_THROW(run(), CLI::RuntimeError);
+    CHECK_THROWS_AS(run(), CLI::RuntimeError);
 
     args = {"sub2"};
     try {
         run();
     } catch(const CLI::RuntimeError &e) {
-        EXPECT_EQ(2, e.get_exit_code());
+        CHECK(e.get_exit_code() == 2);
     }
 }
 
-TEST_F(TApp, NoFallThroughOpts) {
+TEST_CASE_METHOD(TApp, "NoFallThroughOpts", "[subcom]") {
     int val{1};
     app.add_option("--val", val);
 
     app.add_subcommand("sub");
 
     args = {"sub", "--val", "2"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, NoFallThroughPositionals) {
+TEST_CASE_METHOD(TApp, "NoFallThroughPositionals", "[subcom]") {
     int val{1};
     app.add_option("val", val);
 
     app.add_subcommand("sub");
 
     args = {"sub", "2"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, NoFallThroughOptsWithTerminator) {
+TEST_CASE_METHOD(TApp, "NoFallThroughOptsWithTerminator", "[subcom]") {
     int val{1};
     app.add_option("--val", val);
 
@@ -386,10 +382,10 @@ TEST_F(TApp, NoFallThroughOptsWithTerminator) {
 
     args = {"sub", "++", "--val", "2"};
     run();
-    EXPECT_EQ(val, 2);
+    CHECK(2 == val);
 }
 
-TEST_F(TApp, NoFallThroughPositionalsWithTerminator) {
+TEST_CASE_METHOD(TApp, "NoFallThroughPositionalsWithTerminator", "[subcom]") {
     int val{1};
     app.add_option("val", val);
 
@@ -397,15 +393,15 @@ TEST_F(TApp, NoFallThroughPositionalsWithTerminator) {
 
     args = {"sub", "++", "2"};
     run();
-    EXPECT_EQ(val, 2);
+    CHECK(2 == val);
 
     // try with positional only mark
     args = {"sub", "--", "3"};
     run();
-    EXPECT_EQ(val, 3);
+    CHECK(3 == val);
 }
 
-TEST_F(TApp, NamelessSubComPositionals) {
+TEST_CASE_METHOD(TApp, "NamelessSubComPositionals", "[subcom]") {
 
     auto sub = app.add_subcommand();
     int val{1};
@@ -413,21 +409,21 @@ TEST_F(TApp, NamelessSubComPositionals) {
 
     args = {"2"};
     run();
-    EXPECT_EQ(val, 2);
+    CHECK(2 == val);
 }
 
-TEST_F(TApp, NamelessSubWithSub) {
+TEST_CASE_METHOD(TApp, "NamelessSubWithSub", "[subcom]") {
 
     auto sub = app.add_subcommand();
     auto subsub = sub->add_subcommand("val");
 
     args = {"val"};
     run();
-    EXPECT_TRUE(subsub->parsed());
-    EXPECT_TRUE(app.got_subcommand("val"));
+    CHECK(subsub->parsed());
+    CHECK(app.got_subcommand("val"));
 }
 
-TEST_F(TApp, NamelessSubWithMultipleSub) {
+TEST_CASE_METHOD(TApp, "NamelessSubWithMultipleSub", "[subcom]") {
 
     auto sub1 = app.add_subcommand();
     auto sub2 = app.add_subcommand();
@@ -437,33 +433,33 @@ TEST_F(TApp, NamelessSubWithMultipleSub) {
     auto sub2sub2 = sub2->add_subcommand("val4");
     args = {"val1"};
     run();
-    EXPECT_TRUE(sub1sub1->parsed());
-    EXPECT_TRUE(app.got_subcommand("val1"));
+    CHECK(sub1sub1->parsed());
+    CHECK(app.got_subcommand("val1"));
 
     args = {"val2"};
     run();
-    EXPECT_TRUE(sub1sub2->parsed());
-    EXPECT_TRUE(app.got_subcommand("val2"));
+    CHECK(sub1sub2->parsed());
+    CHECK(app.got_subcommand("val2"));
 
     args = {"val3"};
     run();
-    EXPECT_TRUE(sub2sub1->parsed());
-    EXPECT_TRUE(app.got_subcommand("val3"));
+    CHECK(sub2sub1->parsed());
+    CHECK(app.got_subcommand("val3"));
 
     args = {"val4"};
     run();
-    EXPECT_TRUE(sub2sub2->parsed());
-    EXPECT_TRUE(app.got_subcommand("val4"));
+    CHECK(sub2sub2->parsed());
+    CHECK(app.got_subcommand("val4"));
 
     args = {"val4", "val1"};
     run();
-    EXPECT_TRUE(sub2sub2->parsed());
-    EXPECT_TRUE(app.got_subcommand("val4"));
-    EXPECT_TRUE(sub1sub1->parsed());
-    EXPECT_TRUE(app.got_subcommand("val1"));
+    CHECK(sub2sub2->parsed());
+    CHECK(app.got_subcommand("val4"));
+    CHECK(sub1sub1->parsed());
+    CHECK(app.got_subcommand("val1"));
 }
 
-TEST_F(TApp, Nameless4LayerDeep) {
+TEST_CASE_METHOD(TApp, "Nameless4LayerDeep", "[subcom]") {
 
     auto sub = app.add_subcommand();
     auto ssub = sub->add_subcommand();
@@ -474,12 +470,12 @@ TEST_F(TApp, Nameless4LayerDeep) {
 
     args = {"val"};
     run();
-    EXPECT_TRUE(sssssub->parsed());
-    EXPECT_TRUE(app.got_subcommand("val"));
+    CHECK(sssssub->parsed());
+    CHECK(app.got_subcommand("val"));
 }
 
 /// Put subcommands in some crazy pattern and make everything still works
-TEST_F(TApp, Nameless4LayerDeepMulti) {
+TEST_CASE_METHOD(TApp, "Nameless4LayerDeepMulti", "[subcom]") {
 
     auto sub1 = app.add_subcommand();
     auto sub2 = app.add_subcommand();
@@ -495,31 +491,31 @@ TEST_F(TApp, Nameless4LayerDeepMulti) {
     sssub2->add_subcommand("val5");
     args = {"val1"};
     run();
-    EXPECT_TRUE(app.got_subcommand("val1"));
+    CHECK(app.got_subcommand("val1"));
 
     args = {"val2"};
     run();
-    EXPECT_TRUE(app.got_subcommand("val2"));
+    CHECK(app.got_subcommand("val2"));
 
     args = {"val3"};
     run();
-    EXPECT_TRUE(app.got_subcommand("val3"));
+    CHECK(app.got_subcommand("val3"));
 
     args = {"val4"};
     run();
-    EXPECT_TRUE(app.got_subcommand("val4"));
+    CHECK(app.got_subcommand("val4"));
     args = {"val5"};
     run();
-    EXPECT_TRUE(app.got_subcommand("val5"));
+    CHECK(app.got_subcommand("val5"));
 
     args = {"val4", "val1", "val5"};
     run();
-    EXPECT_TRUE(app.got_subcommand("val4"));
-    EXPECT_TRUE(app.got_subcommand("val1"));
-    EXPECT_TRUE(app.got_subcommand("val5"));
+    CHECK(app.got_subcommand("val4"));
+    CHECK(app.got_subcommand("val1"));
+    CHECK(app.got_subcommand("val5"));
 }
 
-TEST_F(TApp, FallThroughRegular) {
+TEST_CASE_METHOD(TApp, "FallThroughRegular", "[subcom]") {
     app.fallthrough();
     int val{1};
     app.add_option("--val", val);
@@ -531,7 +527,7 @@ TEST_F(TApp, FallThroughRegular) {
     run();
 }
 
-TEST_F(TApp, FallThroughShort) {
+TEST_CASE_METHOD(TApp, "FallThroughShort", "[subcom]") {
     app.fallthrough();
     int val{1};
     app.add_option("-v", val);
@@ -543,7 +539,7 @@ TEST_F(TApp, FallThroughShort) {
     run();
 }
 
-TEST_F(TApp, FallThroughPositional) {
+TEST_CASE_METHOD(TApp, "FallThroughPositional", "[subcom]") {
     app.fallthrough();
     int val{1};
     app.add_option("val", val);
@@ -555,7 +551,7 @@ TEST_F(TApp, FallThroughPositional) {
     run();
 }
 
-TEST_F(TApp, FallThroughEquals) {
+TEST_CASE_METHOD(TApp, "FallThroughEquals", "[subcom]") {
     app.fallthrough();
     int val{1};
     app.add_option("--val", val);
@@ -567,7 +563,7 @@ TEST_F(TApp, FallThroughEquals) {
     run();
 }
 
-TEST_F(TApp, EvilParseFallthrough) {
+TEST_CASE_METHOD(TApp, "EvilParseFallthrough", "[subcom]") {
     app.fallthrough();
     int val1{0}, val2{0};
     app.add_option("--val1", val1);
@@ -579,11 +575,11 @@ TEST_F(TApp, EvilParseFallthrough) {
     // Should not throw
     run();
 
-    EXPECT_EQ(1, val1);
-    EXPECT_EQ(2, val2);
+    CHECK(val1 == 1);
+    CHECK(val2 == 2);
 }
 
-TEST_F(TApp, CallbackOrdering) {
+TEST_CASE_METHOD(TApp, "CallbackOrdering", "[subcom]") {
     app.fallthrough();
     int val{1}, sub_val{0};
     app.add_option("--val", val);
@@ -593,16 +589,16 @@ TEST_F(TApp, CallbackOrdering) {
 
     args = {"sub", "--val=2"};
     run();
-    EXPECT_EQ(2, val);
-    EXPECT_EQ(2, sub_val);
+    CHECK(val == 2);
+    CHECK(sub_val == 2);
 
     args = {"--val=2", "sub"};
     run();
-    EXPECT_EQ(2, val);
-    EXPECT_EQ(2, sub_val);
+    CHECK(val == 2);
+    CHECK(sub_val == 2);
 }
 
-TEST_F(TApp, CallbackOrderingImmediate) {
+TEST_CASE_METHOD(TApp, "CallbackOrderingImmediate", "[subcom]") {
     app.fallthrough();
     int val{1}, sub_val{0};
     app.add_option("--val", val);
@@ -612,16 +608,16 @@ TEST_F(TApp, CallbackOrderingImmediate) {
 
     args = {"sub", "--val=2"};
     run();
-    EXPECT_EQ(2, val);
-    EXPECT_EQ(1, sub_val);
+    CHECK(val == 2);
+    CHECK(sub_val == 1);
 
     args = {"--val=2", "sub"};
     run();
-    EXPECT_EQ(2, val);
-    EXPECT_EQ(2, sub_val);
+    CHECK(val == 2);
+    CHECK(sub_val == 2);
 }
 
-TEST_F(TApp, CallbackOrderingImmediateMain) {
+TEST_CASE_METHOD(TApp, "CallbackOrderingImmediateMain", "[subcom]") {
     app.fallthrough();
     int val{0}, sub_val{0};
 
@@ -633,24 +629,24 @@ TEST_F(TApp, CallbackOrderingImmediateMain) {
     app.callback([&val]() { val = 1; });
     args = {"sub"};
     run();
-    EXPECT_EQ(1, val);
-    EXPECT_EQ(0, sub_val);
+    CHECK(val == 1);
+    CHECK(sub_val == 0);
     // the main app callback should run before the subcommand callbacks
     app.immediate_callback();
     val = 0;  // reset value
     run();
-    EXPECT_EQ(2, val);
-    EXPECT_EQ(1, sub_val);
+    CHECK(val == 2);
+    CHECK(sub_val == 1);
     // the subcommand callback now runs immediately after processing and before the main app callback again
     sub->immediate_callback();
     val = 0;  // reset value
     run();
-    EXPECT_EQ(1, val);
-    EXPECT_EQ(0, sub_val);
+    CHECK(val == 1);
+    CHECK(sub_val == 0);
 }
 
 // Test based on issue #308
-TEST_F(TApp, CallbackOrderingImmediateModeOrder) {
+TEST_CASE_METHOD(TApp, "CallbackOrderingImmediateModeOrder", "[subcom]") {
 
     app.require_subcommand(1, 1);
     std::vector<int> v;
@@ -660,81 +656,81 @@ TEST_F(TApp, CallbackOrderingImmediateModeOrder) {
     args = {"hello"};
     run();
     // immediate_callback inherited
-    ASSERT_EQ(v.size(), 2u);
-    EXPECT_EQ(v[0], 1);
-    EXPECT_EQ(v[1], 2);
+    REQUIRE(2u == v.size());
+    CHECK(1 == v[0]);
+    CHECK(2 == v[1]);
     v.clear();
     sub->immediate_callback(true);
     run();
     // immediate_callback is now triggered for the main first
-    ASSERT_EQ(v.size(), 2u);
-    EXPECT_EQ(v[0], 2);
-    EXPECT_EQ(v[1], 1);
+    REQUIRE(2u == v.size());
+    CHECK(2 == v[0]);
+    CHECK(1 == v[1]);
 }
 
-TEST_F(TApp, RequiredSubCom) {
+TEST_CASE_METHOD(TApp, "RequiredSubCom", "[subcom]") {
     app.add_subcommand("sub1");
     app.add_subcommand("sub2");
 
     app.require_subcommand();
 
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"sub1"};
     run();
 }
 
-TEST_F(TApp, SubComExtras) {
+TEST_CASE_METHOD(TApp, "SubComExtras", "[subcom]") {
     app.allow_extras();
     auto sub = app.add_subcommand("sub");
 
     args = {"extra", "sub"};
     run();
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"extra"}));
-    EXPECT_EQ(sub->remaining(), std::vector<std::string>());
+    CHECK(std::vector<std::string>({"extra"}) == app.remaining());
+    CHECK(std::vector<std::string>() == sub->remaining());
 
     args = {"extra1", "extra2", "sub"};
     run();
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"extra1", "extra2"}));
-    EXPECT_EQ(sub->remaining(), std::vector<std::string>());
+    CHECK(std::vector<std::string>({"extra1", "extra2"}) == app.remaining());
+    CHECK(std::vector<std::string>() == sub->remaining());
 
     args = {"sub", "extra1", "extra2"};
     run();
-    EXPECT_EQ(app.remaining(), std::vector<std::string>());
-    EXPECT_EQ(sub->remaining(), std::vector<std::string>({"extra1", "extra2"}));
+    CHECK(std::vector<std::string>() == app.remaining());
+    CHECK(std::vector<std::string>({"extra1", "extra2"}) == sub->remaining());
 
     args = {"extra1", "extra2", "sub", "extra3", "extra4"};
     run();
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"extra1", "extra2"}));
-    EXPECT_EQ(app.remaining(true), std::vector<std::string>({"extra1", "extra2", "extra3", "extra4"}));
-    EXPECT_EQ(sub->remaining(), std::vector<std::string>({"extra3", "extra4"}));
+    CHECK(std::vector<std::string>({"extra1", "extra2"}) == app.remaining());
+    CHECK(std::vector<std::string>({"extra1", "extra2", "extra3", "extra4"}) == app.remaining(true));
+    CHECK(std::vector<std::string>({"extra3", "extra4"}) == sub->remaining());
 }
 
-TEST_F(TApp, Required1SubCom) {
+TEST_CASE_METHOD(TApp, "Required1SubCom", "[subcom]") {
     app.require_subcommand(1);
     app.add_subcommand("sub1");
     app.add_subcommand("sub2");
     app.add_subcommand("sub3");
 
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 
     args = {"sub1"};
     run();
 
     args = {"sub1", "sub2"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, BadSubcommandSearch) {
+TEST_CASE_METHOD(TApp, "BadSubcommandSearch", "[subcom]") {
 
     auto one = app.add_subcommand("one");
     auto two = one->add_subcommand("two");
 
-    EXPECT_THROW(app.get_subcommand(two), CLI::OptionNotFound);
-    EXPECT_THROW(app.get_subcommand_ptr(two), CLI::OptionNotFound);
+    CHECK_THROWS_AS(app.get_subcommand(two), CLI::OptionNotFound);
+    CHECK_THROWS_AS(app.get_subcommand_ptr(two), CLI::OptionNotFound);
 }
 
-TEST_F(TApp, PrefixProgram) {
+TEST_CASE_METHOD(TApp, "PrefixProgram", "[subcom]") {
 
     app.prefix_command();
 
@@ -743,10 +739,10 @@ TEST_F(TApp, PrefixProgram) {
     args = {"--simple", "other", "--simple", "--mine"};
     run();
 
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"other", "--simple", "--mine"}));
+    CHECK(std::vector<std::string>({"other", "--simple", "--mine"}) == app.remaining());
 }
 
-TEST_F(TApp, PrefixNoSeparation) {
+TEST_CASE_METHOD(TApp, "PrefixNoSeparation", "[subcom]") {
 
     app.prefix_command();
 
@@ -755,10 +751,10 @@ TEST_F(TApp, PrefixNoSeparation) {
 
     args = {"--vals", "1", "2", "3", "other"};
 
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, PrefixSeparation) {
+TEST_CASE_METHOD(TApp, "PrefixSeparation", "[subcom]") {
 
     app.prefix_command();
 
@@ -769,11 +765,11 @@ TEST_F(TApp, PrefixSeparation) {
 
     run();
 
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"other"}));
-    EXPECT_EQ(vals, std::vector<int>({1, 2, 3}));
+    CHECK(std::vector<std::string>({"other"}) == app.remaining());
+    CHECK(std::vector<int>({1, 2, 3}) == vals);
 }
 
-TEST_F(TApp, PrefixSubcom) {
+TEST_CASE_METHOD(TApp, "PrefixSubcom", "[subcom]") {
     auto subc = app.add_subcommand("subc");
     subc->prefix_command();
 
@@ -782,19 +778,19 @@ TEST_F(TApp, PrefixSubcom) {
     args = {"--simple", "subc", "other", "--simple", "--mine"};
     run();
 
-    EXPECT_EQ(app.remaining_size(), 0u);
-    EXPECT_EQ(app.remaining_size(true), 3u);
-    EXPECT_EQ(subc->remaining(), std::vector<std::string>({"other", "--simple", "--mine"}));
+    CHECK(0u == app.remaining_size());
+    CHECK(3u == app.remaining_size(true));
+    CHECK(std::vector<std::string>({"other", "--simple", "--mine"}) == subc->remaining());
 }
 
-TEST_F(TApp, InheritHelpAllFlag) {
+TEST_CASE_METHOD(TApp, "InheritHelpAllFlag", "[subcom]") {
     app.set_help_all_flag("--help-all");
     auto subc = app.add_subcommand("subc");
     auto help_opt_list = subc->get_options([](const CLI::Option *opt) { return opt->get_name() == "--help-all"; });
-    EXPECT_EQ(help_opt_list.size(), 1u);
+    CHECK(1u == help_opt_list.size());
 }
 
-TEST_F(TApp, RequiredPosInSubcommand) {
+TEST_CASE_METHOD(TApp, "RequiredPosInSubcommand", "[subcom]") {
     app.require_subcommand();
     std::string bar;
 
@@ -806,13 +802,13 @@ TEST_F(TApp, RequiredPosInSubcommand) {
 
     args = {"foo", "abc"};
     run();
-    EXPECT_EQ(bar, "abc");
+    CHECK("abc" == bar);
     args = {"baz", "cba"};
     run();
-    EXPECT_EQ(bar, "cba");
+    CHECK("cba" == bar);
 
     args = {};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
 struct SubcommandProgram : public TApp {
@@ -839,54 +835,54 @@ struct SubcommandProgram : public TApp {
     }
 };
 
-TEST_F(SubcommandProgram, Working) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand Working", "[subcom]") {
     args = {"-d", "start", "-ffilename"};
 
     run();
 
-    EXPECT_EQ(1, dummy);
-    EXPECT_EQ(start, app.get_subcommands().at(0));
-    EXPECT_EQ("filename", file);
+    CHECK(dummy == 1);
+    CHECK(app.get_subcommands().at(0) == start);
+    CHECK(file == "filename");
 }
 
-TEST_F(SubcommandProgram, Spare) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand Spare", "[subcom]") {
     args = {"extra", "-d", "start", "-ffilename"};
 
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(SubcommandProgram, SpareSub) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand SpareSub", "[subcom]") {
     args = {"-d", "start", "spare", "-ffilename"};
 
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(SubcommandProgram, Multiple) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand Multiple", "[subcom]") {
     args = {"-d", "start", "-ffilename", "stop"};
 
     run();
-    EXPECT_EQ(2u, app.get_subcommands().size());
-    EXPECT_EQ(1, dummy);
-    EXPECT_EQ("filename", file);
+    CHECK(app.get_subcommands().size() == 2u);
+    CHECK(dummy == 1);
+    CHECK(file == "filename");
 }
 
-TEST_F(SubcommandProgram, MultipleOtherOrder) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand MultipleOtherOrder", "[subcom]") {
     args = {"start", "-d", "-ffilename", "stop"};
 
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(SubcommandProgram, MultipleArgs) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand MultipleArgs", "[subcom]") {
     args = {"start", "stop"};
 
     run();
 
-    EXPECT_EQ(2u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().size() == 2u);
 }
 
-TEST_F(SubcommandProgram, CaseCheck) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand CaseCheck", "[subcom]") {
     args = {"Start"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"start"};
     run();
@@ -898,32 +894,32 @@ TEST_F(SubcommandProgram, CaseCheck) {
     run();
 }
 
-TEST_F(TApp, SubcomInheritCaseCheck) {
+TEST_CASE_METHOD(TApp, "SubcomInheritCaseCheck", "[subcom]") {
     app.ignore_case();
     auto sub1 = app.add_subcommand("sub1");
     auto sub2 = app.add_subcommand("sub2");
 
     run();
-    EXPECT_EQ(0u, app.get_subcommands().size());
-    EXPECT_EQ(2u, app.get_subcommands({}).size());
-    EXPECT_EQ(1u, app.get_subcommands([](const CLI::App *s) { return s->get_name() == "sub1"; }).size());
+    CHECK(app.get_subcommands().size() == 0u);
+    CHECK(app.get_subcommands({}).size() == 2u);
+    CHECK(app.get_subcommands([](const CLI::App *s) { return s->get_name() == "sub1"; }).size() == 1u);
 
     args = {"SuB1"};
     run();
-    EXPECT_EQ(sub1, app.get_subcommands().at(0));
-    EXPECT_EQ(1u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().at(0) == sub1);
+    CHECK(app.get_subcommands().size() == 1u);
 
     app.clear();
-    EXPECT_EQ(0u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().size() == 0u);
 
     args = {"sUb2"};
     run();
-    EXPECT_EQ(sub2, app.get_subcommands().at(0));
+    CHECK(app.get_subcommands().at(0) == sub2);
 }
 
-TEST_F(SubcommandProgram, UnderscoreCheck) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand UnderscoreCheck", "[subcom]") {
     args = {"start_"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"start"};
     run();
@@ -935,54 +931,54 @@ TEST_F(SubcommandProgram, UnderscoreCheck) {
     run();
 }
 
-TEST_F(TApp, SubcomInheritUnderscoreCheck) {
+TEST_CASE_METHOD(TApp, "SubcomInheritUnderscoreCheck", "[subcom]") {
     app.ignore_underscore();
     auto sub1 = app.add_subcommand("sub_option1");
     auto sub2 = app.add_subcommand("sub_option2");
 
     run();
-    EXPECT_EQ(0u, app.get_subcommands().size());
-    EXPECT_EQ(2u, app.get_subcommands({}).size());
-    EXPECT_EQ(1u, app.get_subcommands([](const CLI::App *s) { return s->get_name() == "sub_option1"; }).size());
+    CHECK(app.get_subcommands().size() == 0u);
+    CHECK(app.get_subcommands({}).size() == 2u);
+    CHECK(app.get_subcommands([](const CLI::App *s) { return s->get_name() == "sub_option1"; }).size() == 1u);
 
     args = {"suboption1"};
     run();
-    EXPECT_EQ(sub1, app.get_subcommands().at(0));
-    EXPECT_EQ(1u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().at(0) == sub1);
+    CHECK(app.get_subcommands().size() == 1u);
 
     app.clear();
-    EXPECT_EQ(0u, app.get_subcommands().size());
+    CHECK(app.get_subcommands().size() == 0u);
 
     args = {"_suboption2"};
     run();
-    EXPECT_EQ(sub2, app.get_subcommands().at(0));
+    CHECK(app.get_subcommands().at(0) == sub2);
 }
 
-TEST_F(SubcommandProgram, HelpOrder) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand HelpOrder", "[subcom]") {
 
     args = {"-h"};
-    EXPECT_THROW(run(), CLI::CallForHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForHelp);
 
     args = {"start", "-h"};
-    EXPECT_THROW(run(), CLI::CallForHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForHelp);
 
     args = {"-h", "start"};
-    EXPECT_THROW(run(), CLI::CallForHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForHelp);
 }
 
-TEST_F(SubcommandProgram, HelpAllOrder) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand HelpAllOrder", "[subcom]") {
 
     args = {"--help-all"};
-    EXPECT_THROW(run(), CLI::CallForAllHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForAllHelp);
 
     args = {"start", "--help-all"};
-    EXPECT_THROW(run(), CLI::CallForAllHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForAllHelp);
 
     args = {"--help-all", "start"};
-    EXPECT_THROW(run(), CLI::CallForAllHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForAllHelp);
 }
 
-TEST_F(SubcommandProgram, Callbacks) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand Callbacks", "[subcom]") {
 
     start->callback([]() { throw CLI::Success(); });
 
@@ -990,63 +986,63 @@ TEST_F(SubcommandProgram, Callbacks) {
 
     args = {"start"};
 
-    EXPECT_THROW(run(), CLI::Success);
+    CHECK_THROWS_AS(run(), CLI::Success);
 }
 
-TEST_F(SubcommandProgram, Groups) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand Groups", "[subcom]") {
 
     std::string help = app.help();
-    EXPECT_THAT(help, Not(HasSubstr("More Commands:")));
-    EXPECT_THAT(help, HasSubstr("Subcommands:"));
+    CHECK_THAT(help, !Contains("More Commands:"));
+    CHECK_THAT(help, Contains("Subcommands:"));
 
     start->group("More Commands");
     help = app.help();
-    EXPECT_THAT(help, HasSubstr("More Commands:"));
-    EXPECT_THAT(help, HasSubstr("Subcommands:"));
+    CHECK_THAT(help, Contains("More Commands:"));
+    CHECK_THAT(help, Contains("Subcommands:"));
 
     // Case is ignored but for the first subcommand in a group.
     stop->group("more commands");
     help = app.help();
-    EXPECT_THAT(help, HasSubstr("More Commands:"));
-    EXPECT_THAT(help, Not(HasSubstr("Subcommands:")));
+    CHECK_THAT(help, Contains("More Commands:"));
+    CHECK_THAT(help, !Contains("Subcommands:"));
 }
 
-TEST_F(SubcommandProgram, ExtrasErrors) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand ExtrasErrors", "[subcom]") {
 
     args = {"one", "two", "start", "three", "four"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"start", "three", "four"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"one", "two"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(SubcommandProgram, OrderedExtras) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand OrderedExtras", "[subcom]") {
 
     app.allow_extras();
     args = {"one", "two", "start", "three", "four"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     start->allow_extras();
 
     run();
 
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"one", "two"}));
-    EXPECT_EQ(start->remaining(), std::vector<std::string>({"three", "four"}));
-    EXPECT_EQ(app.remaining(true), std::vector<std::string>({"one", "two", "three", "four"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == app.remaining());
+    CHECK(std::vector<std::string>({"three", "four"}) == start->remaining());
+    CHECK(std::vector<std::string>({"one", "two", "three", "four"}) == app.remaining(true));
 
     args = {"one", "two", "start", "three", "--", "four"};
 
     run();
 
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"one", "two", "four"}));
-    EXPECT_EQ(start->remaining(), std::vector<std::string>({"three"}));
-    EXPECT_EQ(app.remaining(true), std::vector<std::string>({"one", "two", "four", "three"}));
+    CHECK(std::vector<std::string>({"one", "two", "four"}) == app.remaining());
+    CHECK(std::vector<std::string>({"three"}) == start->remaining());
+    CHECK(std::vector<std::string>({"one", "two", "four", "three"}) == app.remaining(true));
 }
 
-TEST_F(SubcommandProgram, MixedOrderExtras) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand MixedOrderExtras", "[subcom]") {
 
     app.allow_extras();
     start->allow_extras();
@@ -1055,50 +1051,50 @@ TEST_F(SubcommandProgram, MixedOrderExtras) {
     args = {"one", "two", "start", "three", "four", "stop", "five", "six"};
     run();
 
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"one", "two"}));
-    EXPECT_EQ(start->remaining(), std::vector<std::string>({"three", "four"}));
-    EXPECT_EQ(stop->remaining(), std::vector<std::string>({"five", "six"}));
-    EXPECT_EQ(app.remaining(true), std::vector<std::string>({"one", "two", "three", "four", "five", "six"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == app.remaining());
+    CHECK(std::vector<std::string>({"three", "four"}) == start->remaining());
+    CHECK(std::vector<std::string>({"five", "six"}) == stop->remaining());
+    CHECK(std::vector<std::string>({"one", "two", "three", "four", "five", "six"}) == app.remaining(true));
 
     args = {"one", "two", "stop", "three", "four", "start", "five", "six"};
     run();
 
-    EXPECT_EQ(app.remaining(), std::vector<std::string>({"one", "two"}));
-    EXPECT_EQ(stop->remaining(), std::vector<std::string>({"three", "four"}));
-    EXPECT_EQ(start->remaining(), std::vector<std::string>({"five", "six"}));
-    EXPECT_EQ(app.remaining(true), std::vector<std::string>({"one", "two", "three", "four", "five", "six"}));
+    CHECK(std::vector<std::string>({"one", "two"}) == app.remaining());
+    CHECK(std::vector<std::string>({"three", "four"}) == stop->remaining());
+    CHECK(std::vector<std::string>({"five", "six"}) == start->remaining());
+    CHECK(std::vector<std::string>({"one", "two", "three", "four", "five", "six"}) == app.remaining(true));
 }
 
-TEST_F(SubcommandProgram, CallbackOrder) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand CallbackOrder", "[subcom]") {
     std::vector<int> callback_order;
     start->callback([&callback_order]() { callback_order.push_back(1); });
     stop->callback([&callback_order]() { callback_order.push_back(2); });
 
     args = {"start", "stop"};
     run();
-    EXPECT_EQ(callback_order, std::vector<int>({1, 2}));
+    CHECK(std::vector<int>({1, 2}) == callback_order);
 
     callback_order.clear();
 
     args = {"stop", "start"};
     run();
-    EXPECT_EQ(callback_order, std::vector<int>({2, 1}));
+    CHECK(std::vector<int>({2, 1}) == callback_order);
 }
 
-TEST_F(SubcommandProgram, CallbackOrderImmediate) {
+TEST_CASE_METHOD(SubcommandProgram, "Subcommand CallbackOrderImmediate", "[subcom]") {
     std::vector<int> callback_order;
     start->callback([&callback_order]() { callback_order.push_back(1); })->immediate_callback();
     stop->callback([&callback_order]() { callback_order.push_back(2); });
 
     args = {"start", "stop", "start"};
     run();
-    EXPECT_EQ(callback_order, std::vector<int>({1, 1, 2}));
+    CHECK(std::vector<int>({1, 1, 2}) == callback_order);
 
     callback_order.clear();
 
     args = {"stop", "start", "stop", "start"};
     run();
-    EXPECT_EQ(callback_order, std::vector<int>({1, 1, 2}));
+    CHECK(std::vector<int>({1, 1, 2}) == callback_order);
 }
 
 struct ManySubcommands : public TApp {
@@ -1121,137 +1117,137 @@ struct ManySubcommands : public TApp {
     ManySubcommands &operator=(const ManySubcommands &) = delete;
 };
 
-TEST_F(ManySubcommands, Required1Exact) {
+TEST_CASE_METHOD(ManySubcommands, "Required1Exact", "[subcom]") {
     app.require_subcommand(1);
 
     run();
-    EXPECT_EQ(sub1->remaining(), vs_t({"sub2", "sub3"}));
-    EXPECT_EQ(app.remaining(true), vs_t({"sub2", "sub3"}));
+    CHECK(vs_t({"sub2", "sub3"}) == sub1->remaining());
+    CHECK(vs_t({"sub2", "sub3"}) == app.remaining(true));
 }
 
-TEST_F(ManySubcommands, Required2Exact) {
+TEST_CASE_METHOD(ManySubcommands, "Required2Exact", "[subcom]") {
     app.require_subcommand(2);
 
     run();
-    EXPECT_EQ(sub2->remaining(), vs_t({"sub3"}));
+    CHECK(vs_t({"sub3"}) == sub2->remaining());
 }
 
-TEST_F(ManySubcommands, Required4Failure) {
+TEST_CASE_METHOD(ManySubcommands, "Required4Failure", "[subcom]") {
     app.require_subcommand(4);
 
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(ManySubcommands, RemoveSub) {
+TEST_CASE_METHOD(ManySubcommands, "RemoveSub", "[subcom]") {
     run();
-    EXPECT_EQ(app.remaining_size(true), 0u);
+    CHECK(0u == app.remaining_size(true));
     app.remove_subcommand(sub1);
     app.allow_extras();
     run();
-    EXPECT_EQ(app.remaining_size(true), 1u);
+    CHECK(1u == app.remaining_size(true));
 }
 
-TEST_F(ManySubcommands, RemoveSubFail) {
+TEST_CASE_METHOD(ManySubcommands, "RemoveSubFail", "[subcom]") {
     auto sub_sub = sub1->add_subcommand("subsub");
-    EXPECT_FALSE(app.remove_subcommand(sub_sub));
-    EXPECT_TRUE(sub1->remove_subcommand(sub_sub));
-    EXPECT_FALSE(app.remove_subcommand(nullptr));
+    CHECK(!app.remove_subcommand(sub_sub));
+    CHECK(sub1->remove_subcommand(sub_sub));
+    CHECK(!app.remove_subcommand(nullptr));
 }
 
-TEST_F(ManySubcommands, manyIndexQuery) {
+TEST_CASE_METHOD(ManySubcommands, "manyIndexQuery", "[subcom]") {
     auto s1 = app.get_subcommand(0);
     auto s2 = app.get_subcommand(1);
     auto s3 = app.get_subcommand(2);
     auto s4 = app.get_subcommand(3);
-    EXPECT_EQ(s1, sub1);
-    EXPECT_EQ(s2, sub2);
-    EXPECT_EQ(s3, sub3);
-    EXPECT_EQ(s4, sub4);
-    EXPECT_THROW(app.get_subcommand(4), CLI::OptionNotFound);
+    CHECK(sub1 == s1);
+    CHECK(sub2 == s2);
+    CHECK(sub3 == s3);
+    CHECK(sub4 == s4);
+    CHECK_THROWS_AS(app.get_subcommand(4), CLI::OptionNotFound);
     auto s0 = app.get_subcommand();
-    EXPECT_EQ(s0, sub1);
+    CHECK(sub1 == s0);
 }
 
-TEST_F(ManySubcommands, manyIndexQueryPtr) {
+TEST_CASE_METHOD(ManySubcommands, "manyIndexQueryPtr", "[subcom]") {
     auto s1 = app.get_subcommand_ptr(0);
     auto s2 = app.get_subcommand_ptr(1);
     auto s3 = app.get_subcommand_ptr(2);
     auto s4 = app.get_subcommand_ptr(3);
-    EXPECT_EQ(s1.get(), sub1);
-    EXPECT_EQ(s2.get(), sub2);
-    EXPECT_EQ(s3.get(), sub3);
-    EXPECT_EQ(s4.get(), sub4);
-    EXPECT_THROW(app.get_subcommand_ptr(4), CLI::OptionNotFound);
+    CHECK(sub1 == s1.get());
+    CHECK(sub2 == s2.get());
+    CHECK(sub3 == s3.get());
+    CHECK(sub4 == s4.get());
+    CHECK_THROWS_AS(app.get_subcommand_ptr(4), CLI::OptionNotFound);
 }
 
-TEST_F(ManySubcommands, Required1Fuzzy) {
+TEST_CASE_METHOD(ManySubcommands, "Required1Fuzzy", "[subcom]") {
 
     app.require_subcommand(0, 1);
 
     run();
-    EXPECT_EQ(sub1->remaining(), vs_t({"sub2", "sub3"}));
+    CHECK(vs_t({"sub2", "sub3"}) == sub1->remaining());
 
     app.require_subcommand(-1);
 
     run();
-    EXPECT_EQ(sub1->remaining(), vs_t({"sub2", "sub3"}));
+    CHECK(vs_t({"sub2", "sub3"}) == sub1->remaining());
 }
 
-TEST_F(ManySubcommands, Required2Fuzzy) {
+TEST_CASE_METHOD(ManySubcommands, "Required2Fuzzy", "[subcom]") {
     app.require_subcommand(0, 2);
 
     run();
-    EXPECT_EQ(sub2->remaining(), vs_t({"sub3"}));
-    EXPECT_EQ(app.remaining(true), vs_t({"sub3"}));
+    CHECK(vs_t({"sub3"}) == sub2->remaining());
+    CHECK(vs_t({"sub3"}) == app.remaining(true));
 
     app.require_subcommand(-2);
 
     run();
-    EXPECT_EQ(sub2->remaining(), vs_t({"sub3"}));
+    CHECK(vs_t({"sub3"}) == sub2->remaining());
 }
 
-TEST_F(ManySubcommands, Unlimited) {
+TEST_CASE_METHOD(ManySubcommands, "Unlimited", "[subcom]") {
     run();
-    EXPECT_EQ(app.remaining(true), vs_t());
+    CHECK(vs_t() == app.remaining(true));
 
     app.require_subcommand();
 
     run();
-    EXPECT_EQ(app.remaining(true), vs_t());
+    CHECK(vs_t() == app.remaining(true));
 
     app.require_subcommand(2, 0);  // 2 or more
 
     run();
-    EXPECT_EQ(app.remaining(true), vs_t());
+    CHECK(vs_t() == app.remaining(true));
 }
 
-TEST_F(ManySubcommands, HelpFlags) {
+TEST_CASE_METHOD(ManySubcommands, "HelpFlags", "[subcom]") {
 
     args = {"-h"};
 
-    EXPECT_THROW(run(), CLI::CallForHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForHelp);
 
     args = {"sub2", "-h"};
 
-    EXPECT_THROW(run(), CLI::CallForHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForHelp);
 
     args = {"-h", "sub2"};
 
-    EXPECT_THROW(run(), CLI::CallForHelp);
+    CHECK_THROWS_AS(run(), CLI::CallForHelp);
 }
 
-TEST_F(ManySubcommands, MaxCommands) {
+TEST_CASE_METHOD(ManySubcommands, "MaxCommands", "[subcom]") {
 
     app.require_subcommand(2);
 
     args = {"sub1", "sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     // The extra subcommand counts as an extra
     args = {"sub1", "sub2", "sub3"};
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(sub2->remaining().size(), 1u);
-    EXPECT_EQ(app.count_all(), 2u);
+    CHECK_NOTHROW(run());
+    CHECK(1u == sub2->remaining().size());
+    CHECK(2u == app.count_all());
 
     // Currently, setting sub2 to throw causes an extras error
     // In the future, would passing on up to app's extras be better?
@@ -1262,31 +1258,31 @@ TEST_F(ManySubcommands, MaxCommands) {
 
     args = {"sub1", "sub2"};
 
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1", "sub2", "sub3"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(ManySubcommands, SubcommandExclusion) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandExclusion", "[subcom]") {
 
     sub1->excludes(sub3);
     sub2->excludes(sub3);
     args = {"sub1", "sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1", "sub2", "sub3"};
-    EXPECT_THROW(run(), CLI::ExcludesError);
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
 
     args = {"sub1", "sub2", "sub4"};
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(app.count_all(), 3u);
+    CHECK_NOTHROW(run());
+    CHECK(3u == app.count_all());
 
     args = {"sub3", "sub4"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManySubcommands, SubcommandOptionExclusion) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandOptionExclusion", "[subcom]") {
 
     auto excluder_flag = app.add_flag("--exclude");
     sub1->excludes(excluder_flag)->fallthrough();
@@ -1294,78 +1290,78 @@ TEST_F(ManySubcommands, SubcommandOptionExclusion) {
     sub3->fallthrough();
     sub4->fallthrough();
     args = {"sub3", "sub4", "--exclude"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1", "sub3", "--exclude"};
-    EXPECT_THROW(run(), CLI::ExcludesError);
-    EXPECT_TRUE(sub1->remove_excludes(excluder_flag));
-    EXPECT_NO_THROW(run());
-    EXPECT_FALSE(sub1->remove_excludes(excluder_flag));
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
+    CHECK(sub1->remove_excludes(excluder_flag));
+    CHECK_NOTHROW(run());
+    CHECK(!sub1->remove_excludes(excluder_flag));
 
     args = {"--exclude", "sub2", "sub4"};
-    EXPECT_THROW(run(), CLI::ExcludesError);
-    EXPECT_EQ(sub1->excludes(excluder_flag), sub1);
+    CHECK_THROWS_AS(run(), CLI::ExcludesError);
+    CHECK(sub1 == sub1->excludes(excluder_flag));
     args = {"sub1", "--exclude", "sub2", "sub4"};
     try {
         run();
     } catch(const CLI::ExcludesError &ee) {
-        EXPECT_NE(std::string(ee.what()).find("sub1"), std::string::npos);
+        CHECK(std::string::npos != std::string(ee.what()).find("sub1"));
     }
 }
 
-TEST_F(ManySubcommands, SubcommandNeeds) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandNeeds", "[subcom]") {
 
     sub1->needs(sub2);
     args = {"sub1", "sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     sub1->needs(sub3);
     args = {"sub1", "sub2", "sub3"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1", "sub2", "sub4"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"sub1", "sub2", "sub4"};
     sub1->remove_needs(sub3);
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManySubcommands, SubcommandNeedsOptions) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandNeedsOptions", "[subcom]") {
 
     auto opt = app.add_flag("--subactive");
     sub1->needs(opt);
     sub1->fallthrough();
     args = {"sub1", "--subactive"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--subactive"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     auto opt2 = app.add_flag("--subactive2");
 
     sub1->needs(opt2);
     args = {"sub1", "--subactive"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
 
     args = {"--subactive", "--subactive2", "sub1"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     sub1->remove_needs(opt2);
     args = {"sub1", "--subactive"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManySubcommands, SubcommandNeedsOptionsCallbackOrdering) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandNeedsOptionsCallbackOrdering", "[subcom]") {
     int count{0};
     auto opt = app.add_flag("--subactive");
     app.add_flag("--flag1");
@@ -1373,135 +1369,135 @@ TEST_F(ManySubcommands, SubcommandNeedsOptionsCallbackOrdering) {
     sub1->fallthrough();
     sub1->parse_complete_callback([&count]() { ++count; });
     args = {"sub1", "--flag1", "sub1", "--subactive"};
-    EXPECT_THROW(run(), CLI::RequiresError);
+    CHECK_THROWS_AS(run(), CLI::RequiresError);
     // the subcommand has to pass validation by the first callback
     sub1->immediate_callback(false);
     // now since the callback executes after
 
-    EXPECT_NO_THROW(run());
-    EXPECT_EQ(count, 1);
+    CHECK_NOTHROW(run());
+    CHECK(1 == count);
     sub1->immediate_callback();
     args = {"--subactive", "sub1"};
     // now the required is processed first
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManySubcommands, SubcommandNeedsFail) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandNeedsFail", "[subcom]") {
 
     auto opt = app.add_flag("--subactive");
     auto opt2 = app.add_flag("--dummy");
     sub1->needs(opt);
-    EXPECT_THROW(sub1->needs((CLI::Option *)nullptr), CLI::OptionNotFound);
-    EXPECT_THROW(sub1->needs((CLI::App *)nullptr), CLI::OptionNotFound);
-    EXPECT_THROW(sub1->needs(sub1), CLI::OptionNotFound);
+    CHECK_THROWS_AS(sub1->needs((CLI::Option *)nullptr), CLI::OptionNotFound);
+    CHECK_THROWS_AS(sub1->needs((CLI::App *)nullptr), CLI::OptionNotFound);
+    CHECK_THROWS_AS(sub1->needs(sub1), CLI::OptionNotFound);
 
-    EXPECT_TRUE(sub1->remove_needs(opt));
-    EXPECT_FALSE(sub1->remove_needs(opt2));
-    EXPECT_FALSE(sub1->remove_needs(sub1));
+    CHECK(sub1->remove_needs(opt));
+    CHECK(!sub1->remove_needs(opt2));
+    CHECK(!sub1->remove_needs(sub1));
 }
 
-TEST_F(ManySubcommands, SubcommandRequired) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandRequired", "[subcom]") {
 
     sub1->required();
     args = {"sub1", "sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1", "sub2", "sub3"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub3", "sub4"};
-    EXPECT_THROW(run(), CLI::RequiredError);
+    CHECK_THROWS_AS(run(), CLI::RequiredError);
 }
 
-TEST_F(ManySubcommands, SubcommandDisabled) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandDisabled", "[subcom]") {
 
     sub3->disabled();
     args = {"sub1", "sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub1", "sub2", "sub3"};
     app.allow_extras(false);
     sub2->allow_extras(false);
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
     args = {"sub3", "sub4"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
     sub3->disabled(false);
     args = {"sub3", "sub4"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 }
 
-TEST_F(ManySubcommands, SubcommandTriggeredOff) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandTriggeredOff", "[subcom]") {
 
     app.allow_extras(false);
     sub1->allow_extras(false);
     sub2->allow_extras(false);
     CLI::TriggerOff(sub1, sub2);
     args = {"sub1", "sub2"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"sub2", "sub1", "sub3"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
     CLI::TriggerOff(sub1, {sub3, sub4});
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
     args = {"sub1", "sub2", "sub4"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(ManySubcommands, SubcommandTriggeredOn) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandTriggeredOn", "[subcom]") {
 
     app.allow_extras(false);
     sub1->allow_extras(false);
     sub2->allow_extras(false);
     CLI::TriggerOn(sub1, sub2);
     args = {"sub1", "sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     args = {"sub2", "sub1", "sub4"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
     CLI::TriggerOn(sub1, {sub3, sub4});
     sub2->disabled_by_default(false);
     sub2->disabled(false);
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
     args = {"sub3", "sub1", "sub2"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(ManySubcommands, SubcommandSilence) {
+TEST_CASE_METHOD(ManySubcommands, "SubcommandSilence", "[subcom]") {
 
     sub1->silent();
     args = {"sub1", "sub2"};
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
 
     auto subs = app.get_subcommands();
-    EXPECT_EQ(subs.size(), 1U);
+    CHECK(1U == subs.size());
     sub1->silent(false);
-    EXPECT_FALSE(sub1->get_silent());
+    CHECK(!sub1->get_silent());
     run();
     subs = app.get_subcommands();
-    EXPECT_EQ(subs.size(), 2U);
+    CHECK(2U == subs.size());
 }
 
-TEST_F(TApp, UnnamedSub) {
+TEST_CASE_METHOD(TApp, "UnnamedSub", "[subcom]") {
     double val{0.0};
     auto sub = app.add_subcommand("", "empty name");
     auto opt = sub->add_option("-v,--value", val);
     args = {"-v", "4.56"};
 
     run();
-    EXPECT_EQ(val, 4.56);
+    CHECK(4.56 == val);
     // make sure unnamed sub options can be found from the main app
     auto opt2 = app.get_option("-v");
-    EXPECT_EQ(opt, opt2);
+    CHECK(opt2 == opt);
 
-    EXPECT_THROW(app.get_option("--vvvv"), CLI::OptionNotFound);
+    CHECK_THROWS_AS(app.get_option("--vvvv"), CLI::OptionNotFound);
     // now test in the constant context
     const auto &appC = app;
     auto opt3 = appC.get_option("-v");
-    EXPECT_EQ(opt3->get_name(), "--value");
-    EXPECT_THROW(appC.get_option("--vvvv"), CLI::OptionNotFound);
+    CHECK("--value" == opt3->get_name());
+    CHECK_THROWS_AS(appC.get_option("--vvvv"), CLI::OptionNotFound);
 }
 
-TEST_F(TApp, UnnamedSubMix) {
+TEST_CASE_METHOD(TApp, "UnnamedSubMix", "[subcom]") {
     double val{0.0}, val2{0.0}, val3{0.0};
     app.add_option("-t", val2);
     auto sub1 = app.add_subcommand("", "empty name");
@@ -1511,13 +1507,13 @@ TEST_F(TApp, UnnamedSubMix) {
     args = {"-m", "4.56", "-t", "5.93", "-v", "-3"};
 
     run();
-    EXPECT_EQ(val, -3.0);
-    EXPECT_EQ(val2, 5.93);
-    EXPECT_EQ(val3, 4.56);
-    EXPECT_EQ(app.count_all(), 3u);
+    CHECK(-3.0 == val);
+    CHECK(5.93 == val2);
+    CHECK(4.56 == val3);
+    CHECK(3u == app.count_all());
 }
 
-TEST_F(TApp, UnnamedSubMixExtras) {
+TEST_CASE_METHOD(TApp, "UnnamedSubMixExtras", "[subcom]") {
     double val{0.0}, val2{0.0};
     app.add_option("-t", val2);
     auto sub = app.add_subcommand("", "empty name");
@@ -1525,26 +1521,26 @@ TEST_F(TApp, UnnamedSubMixExtras) {
     args = {"-m", "4.56", "-t", "5.93", "-v", "-3"};
     app.allow_extras();
     run();
-    EXPECT_EQ(val, -3.0);
-    EXPECT_EQ(val2, 5.93);
-    EXPECT_EQ(app.remaining_size(), 2u);
-    EXPECT_EQ(sub->remaining_size(), 0u);
+    CHECK(-3.0 == val);
+    CHECK(5.93 == val2);
+    CHECK(2u == app.remaining_size());
+    CHECK(0u == sub->remaining_size());
 }
 
-TEST_F(TApp, UnnamedSubNoExtras) {
+TEST_CASE_METHOD(TApp, "UnnamedSubNoExtras", "[subcom]") {
     double val{0.0}, val2{0.0};
     app.add_option("-t", val2);
     auto sub = app.add_subcommand();
     sub->add_option("-v,--value", val);
     args = {"-t", "5.93", "-v", "-3"};
     run();
-    EXPECT_EQ(val, -3.0);
-    EXPECT_EQ(val2, 5.93);
-    EXPECT_EQ(app.remaining_size(), 0u);
-    EXPECT_EQ(sub->remaining_size(), 0u);
+    CHECK(-3.0 == val);
+    CHECK(5.93 == val2);
+    CHECK(0u == app.remaining_size());
+    CHECK(0u == sub->remaining_size());
 }
 
-TEST_F(TApp, SubcommandAlias) {
+TEST_CASE_METHOD(TApp, "SubcommandAlias", "[subcom]") {
     double val{0.0};
     auto sub = app.add_subcommand("sub1");
     sub->alias("sub2");
@@ -1552,27 +1548,27 @@ TEST_F(TApp, SubcommandAlias) {
     sub->add_option("-v,--value", val);
     args = {"sub1", "-v", "-3"};
     run();
-    EXPECT_EQ(val, -3.0);
+    CHECK(-3.0 == val);
 
     args = {"sub2", "--value", "-5"};
     run();
-    EXPECT_EQ(val, -5.0);
+    CHECK(-5.0 == val);
 
     args = {"sub3", "-v", "7"};
     run();
-    EXPECT_EQ(val, 7);
+    CHECK(7 == val);
 
     auto &al = sub->get_aliases();
-    ASSERT_GE(al.size(), 2U);
+    REQUIRE(2U <= al.size());
 
-    EXPECT_EQ(al[0], "sub2");
-    EXPECT_EQ(al[1], "sub3");
+    CHECK("sub2" == al[0]);
+    CHECK("sub3" == al[1]);
 
     sub->clear_aliases();
-    EXPECT_TRUE(al.empty());
+    CHECK(al.empty());
 }
 
-TEST_F(TApp, SubcommandAliasIgnoreCaseUnderscore) {
+TEST_CASE_METHOD(TApp, "SubcommandAliasIgnoreCaseUnderscore", "[subcom]") {
     double val{0.0};
     auto sub = app.add_subcommand("sub1");
     sub->alias("sub2");
@@ -1581,63 +1577,63 @@ TEST_F(TApp, SubcommandAliasIgnoreCaseUnderscore) {
     sub->add_option("-v,--value", val);
     args = {"sub1", "-v", "-3"};
     run();
-    EXPECT_EQ(val, -3.0);
+    CHECK(-3.0 == val);
 
     args = {"SUB2", "--value", "-5"};
     run();
-    EXPECT_EQ(val, -5.0);
+    CHECK(-5.0 == val);
 
     args = {"sUb3", "-v", "7"};
     run();
-    EXPECT_EQ(val, 7);
+    CHECK(7 == val);
     sub->ignore_underscore();
     args = {"sub_1", "-v", "-3"};
     run();
-    EXPECT_EQ(val, -3.0);
+    CHECK(-3.0 == val);
 
     args = {"SUB_2", "--value", "-5"};
     run();
-    EXPECT_EQ(val, -5.0);
+    CHECK(-5.0 == val);
 
     args = {"sUb_3", "-v", "7"};
     run();
-    EXPECT_EQ(val, 7);
+    CHECK(7 == val);
 
     sub->ignore_case(false);
     args = {"sub_1", "-v", "-3"};
     run();
-    EXPECT_EQ(val, -3.0);
+    CHECK(-3.0 == val);
 
     args = {"SUB_2", "--value", "-5"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"sUb_3", "-v", "7"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 }
 
-TEST_F(TApp, OptionGroupAlias) {
+TEST_CASE_METHOD(TApp, "OptionGroupAlias", "[subcom]") {
     double val{0.0};
     auto sub = app.add_option_group("sub1");
     sub->alias("sub2");
     sub->alias("sub3");
     sub->add_option("-v,--value", val);
     args = {"sub1", "-v", "-3"};
-    EXPECT_THROW(run(), CLI::ExtrasError);
+    CHECK_THROWS_AS(run(), CLI::ExtrasError);
 
     args = {"sub2", "--value", "-5"};
     run();
-    EXPECT_EQ(val, -5.0);
+    CHECK(-5.0 == val);
 
     args = {"sub3", "-v", "7"};
     run();
-    EXPECT_EQ(val, 7);
+    CHECK(7 == val);
 
     args = {"-v", "-3"};
     run();
-    EXPECT_EQ(val, -3);
+    CHECK(-3 == val);
 }
 
-TEST_F(TApp, subcommand_help) {
+TEST_CASE_METHOD(TApp, "subcommand_help", "[subcom]") {
     auto sub1 = app.add_subcommand("help")->silent();
     bool flag{false};
     app.add_flag("--one", flag, "FLAGGER");
@@ -1650,52 +1646,52 @@ TEST_F(TApp, subcommand_help) {
         called = true;
     }
     auto helpstr = app.help();
-    EXPECT_THAT(helpstr, HasSubstr("FLAGGER"));
-    EXPECT_TRUE(called);
+    CHECK_THAT(helpstr, Contains("FLAGGER"));
+    CHECK(called);
 }
 
-TEST_F(TApp, AliasErrors) {
+TEST_CASE_METHOD(TApp, "AliasErrors", "[subcom]") {
     auto sub1 = app.add_subcommand("sub1");
     auto sub2 = app.add_subcommand("sub2");
 
-    EXPECT_THROW(sub2->alias("this is a not a valid alias"), CLI::IncorrectConstruction);
-    EXPECT_THROW(sub2->alias("-alias"), CLI::IncorrectConstruction);
-    EXPECT_THROW(sub2->alias("alia$"), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(sub2->alias("this is a not a valid alias"), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(sub2->alias("-alias"), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(sub2->alias("alia$"), CLI::IncorrectConstruction);
 
-    EXPECT_THROW(app.add_subcommand("--bad_subcommand_name", "documenting the bad subcommand"),
-                 CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_subcommand("--bad_subcommand_name", "documenting the bad subcommand"),
+                    CLI::IncorrectConstruction);
 
-    EXPECT_THROW(app.add_subcommand("documenting a subcommand", "sub3"), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app.add_subcommand("documenting a subcommand", "sub3"), CLI::IncorrectConstruction);
     // cannot alias to an existing subcommand
-    EXPECT_THROW(sub2->alias("sub1"), CLI::OptionAlreadyAdded);
-    EXPECT_THROW(sub1->alias("sub2"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub2->alias("sub1"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub1->alias("sub2"), CLI::OptionAlreadyAdded);
     // aliasing to an existing name should be allowed
-    EXPECT_NO_THROW(sub1->alias(sub1->get_name()));
+    CHECK_NOTHROW(sub1->alias(sub1->get_name()));
 
     sub1->alias("les1")->alias("les2")->alias("les_3");
     sub2->alias("s2les1")->alias("s2les2")->alias("s2les3");
 
-    EXPECT_THROW(sub2->alias("les2"), CLI::OptionAlreadyAdded);
-    EXPECT_THROW(sub1->alias("s2les2"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub2->alias("les2"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub1->alias("s2les2"), CLI::OptionAlreadyAdded);
 
-    EXPECT_THROW(sub2->name("sub1"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub2->name("sub1"), CLI::OptionAlreadyAdded);
     sub2->ignore_underscore();
-    EXPECT_THROW(sub2->alias("les3"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub2->alias("les3"), CLI::OptionAlreadyAdded);
 }
 // test adding a subcommand via the pointer
-TEST_F(TApp, ExistingSubcommandMatch) {
+TEST_CASE_METHOD(TApp, "ExistingSubcommandMatch", "[subcom]") {
     auto sshared = std::make_shared<CLI::App>("documenting the subcommand", "sub1");
     sshared->alias("sub2")->alias("sub3");
 
-    EXPECT_EQ(sshared->get_name(), "sub1");
+    CHECK("sub1" == sshared->get_name());
     app.add_subcommand("sub1");
 
     try {
         app.add_subcommand(sshared);
         // this should throw the next line should never be reached
-        EXPECT_FALSE(true);
+        CHECK(!true);
     } catch(const CLI::OptionAlreadyAdded &oaa) {
-        EXPECT_THAT(oaa.what(), HasSubstr("sub1"));
+        CHECK_THAT(oaa.what(), Contains("sub1"));
     }
     sshared->name("osub");
     app.add_subcommand("sub2");
@@ -1703,35 +1699,35 @@ TEST_F(TApp, ExistingSubcommandMatch) {
     try {
         app.add_subcommand(sshared);
         // this should throw the next line should never be reached
-        EXPECT_FALSE(true);
+        CHECK(!true);
     } catch(const CLI::OptionAlreadyAdded &oaa) {
-        EXPECT_THAT(oaa.what(), HasSubstr("sub2"));
+        CHECK_THAT(oaa.what(), Contains("sub2"));
     }
     // now check that disabled subcommands can be added regardless of name
     sshared->name("sub1");
     sshared->disabled();
-    EXPECT_NO_THROW(app.add_subcommand(sshared));
+    CHECK_NOTHROW(app.add_subcommand(sshared));
 }
 
-TEST_F(TApp, AliasErrorsInOptionGroup) {
+TEST_CASE_METHOD(TApp, "AliasErrorsInOptionGroup", "[subcom]") {
     auto sub1 = app.add_subcommand("sub1");
     auto g2 = app.add_option_group("g1");
     auto sub2 = g2->add_subcommand("sub2");
 
     // cannot alias to an existing subcommand even if it is in an option group
-    EXPECT_THROW(sub2->alias("sub1"), CLI::OptionAlreadyAdded);
-    EXPECT_THROW(sub1->alias("sub2"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub2->alias("sub1"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub1->alias("sub2"), CLI::OptionAlreadyAdded);
 
     sub1->alias("les1")->alias("les2")->alias("les3");
     sub2->alias("s2les1")->alias("s2les2")->alias("s2les3");
 
-    EXPECT_THROW(sub2->alias("les2"), CLI::OptionAlreadyAdded);
-    EXPECT_THROW(sub1->alias("s2les2"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub2->alias("les2"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub1->alias("s2les2"), CLI::OptionAlreadyAdded);
 
-    EXPECT_THROW(sub2->name("sub1"), CLI::OptionAlreadyAdded);
+    CHECK_THROWS_AS(sub2->name("sub1"), CLI::OptionAlreadyAdded);
 }
 
-TEST(SharedSubTests, SharedSubcommand) {
+TEST_CASE("SharedSubTests: SharedSubcommand", "[subcom]") {
     double val{0.0}, val2{0.0}, val3{0.0}, val4{0.0};
     CLI::App app1{"test program1"};
 
@@ -1745,7 +1741,7 @@ TEST(SharedSubTests, SharedSubcommand) {
     auto subown = app1.get_subcommand_ptr(sub);
     // add the extracted subcommand to a different app
     app2.add_subcommand(std::move(subown));
-    EXPECT_THROW(app2.add_subcommand(CLI::App_p{}), CLI::IncorrectConstruction);
+    CHECK_THROWS_AS(app2.add_subcommand(CLI::App_p{}), CLI::IncorrectConstruction);
     input_t args1 = {"-m", "4.56", "-t", "5.93", "-v", "-3"};
     input_t args2 = {"-m", "4.56", "-g", "8.235"};
     std::reverse(std::begin(args1), std::end(args1));
@@ -1755,13 +1751,13 @@ TEST(SharedSubTests, SharedSubcommand) {
 
     app2.parse(args2);
 
-    EXPECT_EQ(val, -3.0);
-    EXPECT_EQ(val2, 5.93);
-    EXPECT_EQ(val3, 4.56);
-    EXPECT_EQ(val4, 8.235);
+    CHECK(-3.0 == val);
+    CHECK(5.93 == val2);
+    CHECK(4.56 == val3);
+    CHECK(8.235 == val4);
 }
 
-TEST(SharedSubTests, SharedSubIndependent) {
+TEST_CASE("SharedSubTests: SharedSubIndependent", "[subcom]") {
     double val{0.0}, val2{0.0}, val4{0.0};
     CLI::App_p app1 = std::make_shared<CLI::App>("test program1");
     app1->allow_extras();
@@ -1784,12 +1780,12 @@ TEST(SharedSubTests, SharedSubIndependent) {
     // parse with the extracted subcommand
     subown->parse(args2);
 
-    EXPECT_EQ(val, -3.0);
-    EXPECT_EQ(val2, 5.93);
-    EXPECT_EQ(val4, 8.235);
+    CHECK(-3.0 == val);
+    CHECK(5.93 == val2);
+    CHECK(8.235 == val4);
 }
 
-TEST(SharedSubTests, SharedSubIndependentReuse) {
+TEST_CASE("SharedSubTests: SharedSubIndependentReuse", "[subcom]") {
     double val{0.0}, val2{0.0}, val4{0.0};
     CLI::App_p app1 = std::make_shared<CLI::App>("test program1");
     app1->allow_extras();
@@ -1809,60 +1805,60 @@ TEST(SharedSubTests, SharedSubIndependentReuse) {
     // parse with the extracted subcommand
     subown->parse("program1 -m 4.56 -g 8.235", true);
 
-    EXPECT_EQ(val, -3.0);
-    EXPECT_EQ(val2, 5.93);
-    EXPECT_EQ(val4, 8.235);
+    CHECK(-3.0 == val);
+    CHECK(5.93 == val2);
+    CHECK(8.235 == val4);
     val = 0.0;
     val2 = 0.0;
-    EXPECT_EQ(subown->get_name(), "program1");
+    CHECK("program1" == subown->get_name());
     // this tests the name reset in subcommand since it was automatic
     app1->parse(args2);
-    EXPECT_EQ(val, -3.0);
-    EXPECT_EQ(val2, 5.93);
+    CHECK(-3.0 == val);
+    CHECK(5.93 == val2);
 }
 
-TEST_F(ManySubcommands, getSubtests) {
+TEST_CASE_METHOD(ManySubcommands, "getSubtests", "[subcom]") {
     CLI::App_p sub2p = app.get_subcommand_ptr(sub2);
-    EXPECT_EQ(sub2p.get(), sub2);
-    EXPECT_THROW(app.get_subcommand_ptr(nullptr), CLI::OptionNotFound);
-    EXPECT_THROW(app.get_subcommand(nullptr), CLI::OptionNotFound);
+    CHECK(sub2 == sub2p.get());
+    CHECK_THROWS_AS(app.get_subcommand_ptr(nullptr), CLI::OptionNotFound);
+    CHECK_THROWS_AS(app.get_subcommand(nullptr), CLI::OptionNotFound);
     CLI::App_p sub3p = app.get_subcommand_ptr(2);
-    EXPECT_EQ(sub3p.get(), sub3);
+    CHECK(sub3 == sub3p.get());
 }
 
-TEST_F(ManySubcommands, defaultDisabledSubcommand) {
+TEST_CASE_METHOD(ManySubcommands, "defaultDisabledSubcommand", "[subcom]") {
 
     sub1->fallthrough();
     sub2->disabled_by_default();
     run();
     auto rem = app.remaining();
-    EXPECT_EQ(rem.size(), 1u);
-    EXPECT_EQ(rem[0], "sub2");
-    EXPECT_TRUE(sub2->get_disabled_by_default());
+    CHECK(1u == rem.size());
+    CHECK("sub2" == rem[0]);
+    CHECK(sub2->get_disabled_by_default());
     sub2->disabled(false);
-    EXPECT_FALSE(sub2->get_disabled());
+    CHECK(!sub2->get_disabled());
     run();
     // this should disable it again even though it was disabled
     rem = app.remaining();
-    EXPECT_EQ(rem.size(), 1u);
-    EXPECT_EQ(rem[0], "sub2");
-    EXPECT_TRUE(sub2->get_disabled_by_default());
-    EXPECT_TRUE(sub2->get_disabled());
+    CHECK(1u == rem.size());
+    CHECK("sub2" == rem[0]);
+    CHECK(sub2->get_disabled_by_default());
+    CHECK(sub2->get_disabled());
 }
 
-TEST_F(ManySubcommands, defaultEnabledSubcommand) {
+TEST_CASE_METHOD(ManySubcommands, "defaultEnabledSubcommand", "[subcom]") {
 
     sub2->enabled_by_default();
     run();
     auto rem = app.remaining();
-    EXPECT_EQ(rem.size(), 0u);
-    EXPECT_TRUE(sub2->get_enabled_by_default());
+    CHECK(0u == rem.size());
+    CHECK(sub2->get_enabled_by_default());
     sub2->disabled();
-    EXPECT_TRUE(sub2->get_disabled());
+    CHECK(sub2->get_disabled());
     run();
     // this should disable it again even though it was disabled
     rem = app.remaining();
-    EXPECT_EQ(rem.size(), 0u);
-    EXPECT_TRUE(sub2->get_enabled_by_default());
-    EXPECT_FALSE(sub2->get_disabled());
+    CHECK(0u == rem.size());
+    CHECK(sub2->get_enabled_by_default());
+    CHECK(!sub2->get_disabled());
 }
diff --git a/packages/CLI11/tests/TimerTest.cpp b/packages/CLI11/tests/TimerTest.cpp
index 51d8bb09b50c79b79ff62a37224ef79253c4dbed..8c88f478ace08a3e192c0439717ec44ad32b9ab9 100644
--- a/packages/CLI11/tests/TimerTest.cpp
+++ b/packages/CLI11/tests/TimerTest.cpp
@@ -5,66 +5,66 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "CLI/Timer.hpp"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
+
+#include "catch.hpp"
 #include <chrono>
 #include <sstream>
 #include <string>
 #include <thread>
 
-using ::testing::HasSubstr;
+using Catch::Matchers::Contains;
 
-TEST(Timer, MSTimes) {
+TEST_CASE("Timer: MSTimes", "[timer]") {
     CLI::Timer timer{"My Timer"};
     std::this_thread::sleep_for(std::chrono::milliseconds(123));
     std::string output = timer.to_string();
     std::string new_output = (timer / 1000000).to_string();
-    EXPECT_THAT(output, HasSubstr("My Timer"));
-    EXPECT_THAT(output, HasSubstr(" ms"));
-    EXPECT_THAT(new_output, HasSubstr(" ns"));
+    CHECK_THAT(output, Contains("My Timer"));
+    CHECK_THAT(output, Contains(" ms"));
+    CHECK_THAT(new_output, Contains(" ns"));
 }
 
 /* Takes too long
-TEST(Timer, STimes) {
+TEST_CASE("Timer: STimes", "[timer]") {
     CLI::Timer timer;
     std::this_thread::sleep_for(std::chrono::seconds(1));
     std::string output = timer.to_string();
-    EXPECT_THAT(output, HasSubstr(" s"));
+    CHECK_THAT (output, Contains(" s"));
 }
 */
 
 // Fails on Windows
-// TEST(Timer, UStimes) {
+// TEST_CASE("Timer: UStimes", "[timer]") {
 //    CLI::Timer timer;
 //    std::this_thread::sleep_for(std::chrono::microseconds(2));
 //    std::string output = timer.to_string();
-//    EXPECT_THAT(output, HasSubstr(" ms"));
+//    CHECK_THAT (output, Contains(" ms"));
 //}
 
-TEST(Timer, BigTimer) {
+TEST_CASE("Timer: BigTimer", "[timer]") {
     CLI::Timer timer{"My Timer", CLI::Timer::Big};
     std::string output = timer.to_string();
-    EXPECT_THAT(output, HasSubstr("Time ="));
-    EXPECT_THAT(output, HasSubstr("-----------"));
+    CHECK_THAT(output, Contains("Time ="));
+    CHECK_THAT(output, Contains("-----------"));
 }
 
-TEST(Timer, AutoTimer) {
+TEST_CASE("Timer: AutoTimer", "[timer]") {
     CLI::AutoTimer timer;
     std::string output = timer.to_string();
-    EXPECT_THAT(output, HasSubstr("Timer"));
+    CHECK_THAT(output, Contains("Timer"));
 }
 
-TEST(Timer, PrintTimer) {
+TEST_CASE("Timer: PrintTimer", "[timer]") {
     std::stringstream out;
     CLI::AutoTimer timer;
     out << timer;
     std::string output = out.str();
-    EXPECT_THAT(output, HasSubstr("Timer"));
+    CHECK_THAT(output, Contains("Timer"));
 }
 
-TEST(Timer, TimeItTimer) {
+TEST_CASE("Timer: TimeItTimer", "[timer]") {
     CLI::Timer timer;
     std::string output = timer.time_it([]() { std::this_thread::sleep_for(std::chrono::milliseconds(10)); }, .1);
     std::cout << output << std::endl;
-    EXPECT_THAT(output, HasSubstr("ms"));
+    CHECK_THAT(output, Contains("ms"));
 }
diff --git a/packages/CLI11/tests/TransformTest.cpp b/packages/CLI11/tests/TransformTest.cpp
index 53df504acadf14e91cfb7b58bde88917593664b9..84ac544a4d003778cbcd73797034eb70c5612d98 100644
--- a/packages/CLI11/tests/TransformTest.cpp
+++ b/packages/CLI11/tests/TransformTest.cpp
@@ -20,98 +20,98 @@
 #endif
 #endif
 
-TEST_F(TApp, SimpleTransform) {
+TEST_CASE_METHOD(TApp, "SimpleTransform", "[transform]") {
     int value{0};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer({{"one", std::string("1")}}));
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
-TEST_F(TApp, SimpleTransformInitList) {
+TEST_CASE_METHOD(TApp, "SimpleTransformInitList", "[transform]") {
     int value{0};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer({{"one", "1"}}));
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
-TEST_F(TApp, SimpleNumericalTransform) {
+TEST_CASE_METHOD(TApp, "SimpleNumericalTransform", "[transform]") {
     int value{0};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer(CLI::TransformPairs<int>{{"one", 1}}));
     args = {"-s", "one"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
-TEST_F(TApp, EnumTransform) {
-    enum class test : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
-    test value{test::val2};
+TEST_CASE_METHOD(TApp, "EnumTransform", "[transform]") {
+    enum class test_cli : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
+    test_cli value{test_cli::val2};
     auto opt = app.add_option("-s", value)
-                   ->transform(CLI::Transformer(
-                       CLI::TransformPairs<test>{{"val1", test::val1}, {"val2", test::val2}, {"val3", test::val3}}));
+                   ->transform(CLI::Transformer(CLI::TransformPairs<test_cli>{
+                       {"val1", test_cli::val1}, {"val2", test_cli::val2}, {"val3", test_cli::val3}}));
     args = {"-s", "val1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, test::val1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(test_cli::val1 == value);
 
     args = {"-s", "val2"};
     run();
-    EXPECT_EQ(value, test::val2);
+    CHECK(test_cli::val2 == value);
 
     args = {"-s", "val3"};
     run();
-    EXPECT_EQ(value, test::val3);
+    CHECK(test_cli::val3 == value);
 
     args = {"-s", "val4"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
     // transformer doesn't do any checking so this still works
     args = {"-s", "5"};
     run();
-    EXPECT_EQ(static_cast<std::int16_t>(value), std::int16_t(5));
+    CHECK(std::int16_t(5) == static_cast<std::int16_t>(value));
 }
 
-TEST_F(TApp, EnumCheckedTransform) {
-    enum class test : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
-    test value{test::val1};
+TEST_CASE_METHOD(TApp, "EnumCheckedTransform", "[transform]") {
+    enum class test_cli : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
+    test_cli value{test_cli::val1};
     auto opt = app.add_option("-s", value)
-                   ->transform(CLI::CheckedTransformer(
-                       CLI::TransformPairs<test>{{"val1", test::val1}, {"val2", test::val2}, {"val3", test::val3}}));
+                   ->transform(CLI::CheckedTransformer(CLI::TransformPairs<test_cli>{
+                       {"val1", test_cli::val1}, {"val2", test_cli::val2}, {"val3", test_cli::val3}}));
     args = {"-s", "val1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, test::val1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(test_cli::val1 == value);
 
     args = {"-s", "val2"};
     run();
-    EXPECT_EQ(value, test::val2);
+    CHECK(test_cli::val2 == value);
 
     args = {"-s", "val3"};
     run();
-    EXPECT_EQ(value, test::val3);
+    CHECK(test_cli::val3 == value);
 
     args = {"-s", "17"};
     run();
-    EXPECT_EQ(value, test::val3);
+    CHECK(test_cli::val3 == value);
 
     args = {"-s", "val4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-s", "5"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
 // from jzakrzewski Issue #330
-TEST_F(TApp, EnumCheckedDefaultTransform) {
+TEST_CASE_METHOD(TApp, "EnumCheckedDefaultTransform", "[transform]") {
     enum class existing : std::int16_t { abort, overwrite, remove };
     app.add_option("--existing", "What to do if file already exists in the destination")
         ->transform(
@@ -122,14 +122,14 @@ TEST_F(TApp, EnumCheckedDefaultTransform) {
         ->default_val("abort");
     args = {"--existing", "overwrite"};
     run();
-    EXPECT_EQ(app.get_option("--existing")->as<existing>(), existing::overwrite);
+    CHECK(existing::overwrite == app.get_option("--existing")->as<existing>());
     args.clear();
     run();
-    EXPECT_EQ(app.get_option("--existing")->as<existing>(), existing::abort);
+    CHECK(existing::abort == app.get_option("--existing")->as<existing>());
 }
 
 // test from https://github.com/CLIUtils/CLI11/issues/369  [Jakub Zakrzewski](https://github.com/jzakrzewski)
-TEST_F(TApp, EnumCheckedDefaultTransformCallback) {
+TEST_CASE_METHOD(TApp, "EnumCheckedDefaultTransformCallback", "[transform]") {
     enum class existing : std::int16_t { abort, overwrite, remove };
     auto cmd = std::make_shared<CLI::App>("deploys the repository somewhere", "deploy");
     cmd->add_option("--existing", "What to do if file already exists in the destination")
@@ -140,60 +140,60 @@ TEST_F(TApp, EnumCheckedDefaultTransformCallback) {
                                                                               {"remove", existing::remove}}))
         ->default_val("abort");
 
-    cmd->callback([cmd]() { EXPECT_EQ(cmd->get_option("--existing")->as<existing>(), existing::abort); });
+    cmd->callback([cmd]() { CHECK(cmd->get_option("--existing")->as<existing>() == existing::abort); });
     app.add_subcommand(cmd);
 
     args = {"deploy"};
     run();
 }
 
-TEST_F(TApp, SimpleTransformFn) {
+TEST_CASE_METHOD(TApp, "SimpleTransformFn", "[transform]") {
     int value{0};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer({{"one", "1"}}, CLI::ignore_case));
     args = {"-s", "ONE"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
 #if defined(CLI11_HAS_STRING_VIEW)
-TEST_F(TApp, StringViewTransformFn) {
+TEST_CASE_METHOD(TApp, "StringViewTransformFn", "[transform]") {
     std::string value;
     std::map<std::string_view, std::string_view> map = {// key length > std::string().capacity() [SSO length]
                                                         {"a-rather-long-argument", "mapped"}};
     app.add_option("-s", value)->transform(CLI::CheckedTransformer(map));
     args = {"-s", "a-rather-long-argument"};
     run();
-    EXPECT_EQ(value, "mapped");
+    CHECK("mapped" == value);
 }
 
 #endif
 
-TEST_F(TApp, SimpleNumericalTransformFn) {
+TEST_CASE_METHOD(TApp, "SimpleNumericalTransformFn", "[transform]") {
     int value{0};
     auto opt =
         app.add_option("-s", value)
             ->transform(CLI::Transformer(std::vector<std::pair<std::string, int>>{{"one", 1}}, CLI::ignore_case));
     args = {"-s", "ONe"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
-TEST_F(TApp, SimpleNumericalTransformFnVector) {
+TEST_CASE_METHOD(TApp, "SimpleNumericalTransformFnVector", "[transform]") {
     std::vector<std::pair<std::string, int>> conversions{{"one", 1}, {"two", 2}};
     int value{0};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer(conversions, CLI::ignore_case));
     args = {"-s", "ONe"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
-TEST_F(TApp, SimpleNumericalTransformFnArray) {
+TEST_CASE_METHOD(TApp, "SimpleNumericalTransformFnArray", "[transform]") {
     std::array<std::pair<std::string, int>, 2> conversions;
     conversions[0] = std::make_pair(std::string("one"), 1);
     conversions[1] = std::make_pair(std::string("two"), 2);
@@ -202,14 +202,14 @@ TEST_F(TApp, SimpleNumericalTransformFnArray) {
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer(conversions, CLI::ignore_case));
     args = {"-s", "ONe"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 }
 
 #ifdef CLI11_CPP14
 // zero copy constexpr array operation with transformer example and test
-TEST_F(TApp, SimpleNumericalTransformFnconstexprArray) {
+TEST_CASE_METHOD(TApp, "SimpleNumericalTransformFnconstexprArray", "[transform]") {
     constexpr std::pair<const char *, int> p1{"one", 1};
     constexpr std::pair<const char *, int> p2{"two", 2};
     constexpr std::array<std::pair<const char *, int>, 2> conversions_c{{p1, p2}};
@@ -218,128 +218,129 @@ TEST_F(TApp, SimpleNumericalTransformFnconstexprArray) {
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer(&conversions_c, CLI::ignore_case));
     args = {"-s", "ONe"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(1 == value);
 
     args = {"-s", "twO"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, 2);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(2 == value);
 }
 #endif
 
-TEST_F(TApp, EnumTransformFn) {
-    enum class test : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
-    test value{test::val2};
+TEST_CASE_METHOD(TApp, "EnumTransformFn", "[transform]") {
+    enum class test_cli : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
+    test_cli value{test_cli::val2};
     auto opt = app.add_option("-s", value)
-                   ->transform(CLI::Transformer(
-                       CLI::TransformPairs<test>{{"val1", test::val1}, {"val2", test::val2}, {"val3", test::val3}},
-                       CLI::ignore_case,
-                       CLI::ignore_underscore));
+                   ->transform(CLI::Transformer(CLI::TransformPairs<test_cli>{{"val1", test_cli::val1},
+                                                                              {"val2", test_cli::val2},
+                                                                              {"val3", test_cli::val3}},
+                                                CLI::ignore_case,
+                                                CLI::ignore_underscore));
     args = {"-s", "val_1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, test::val1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(test_cli::val1 == value);
 
     args = {"-s", "VAL_2"};
     run();
-    EXPECT_EQ(value, test::val2);
+    CHECK(test_cli::val2 == value);
 
     args = {"-s", "VAL3"};
     run();
-    EXPECT_EQ(value, test::val3);
+    CHECK(test_cli::val3 == value);
 
     args = {"-s", "val_4"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, EnumTransformFnMap) {
-    enum class test : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
-    std::map<std::string, test> map{{"val1", test::val1}, {"val2", test::val2}, {"val3", test::val3}};
-    test value{test::val3};
+TEST_CASE_METHOD(TApp, "EnumTransformFnMap", "[transform]") {
+    enum class test_cli : std::int16_t { val1 = 3, val2 = 4, val3 = 17 };
+    std::map<std::string, test_cli> map{{"val1", test_cli::val1}, {"val2", test_cli::val2}, {"val3", test_cli::val3}};
+    test_cli value{test_cli::val3};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer(map, CLI::ignore_case, CLI::ignore_underscore));
     args = {"-s", "val_1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, test::val1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(test_cli::val1 == value);
 
     args = {"-s", "VAL_2"};
     run();
-    EXPECT_EQ(value, test::val2);
+    CHECK(test_cli::val2 == value);
 
     args = {"-s", "VAL3"};
     run();
-    EXPECT_EQ(value, test::val3);
+    CHECK(test_cli::val3 == value);
 
     args = {"-s", "val_4"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 }
 
-TEST_F(TApp, EnumTransformFnPtrMap) {
-    enum class test : std::int16_t { val1 = 3, val2 = 4, val3 = 17, val4 = 37 };
-    std::map<std::string, test> map{{"val1", test::val1}, {"val2", test::val2}, {"val3", test::val3}};
-    test value{test::val2};
+TEST_CASE_METHOD(TApp, "EnumTransformFnPtrMap", "[transform]") {
+    enum class test_cli : std::int16_t { val1 = 3, val2 = 4, val3 = 17, val4 = 37 };
+    std::map<std::string, test_cli> map{{"val1", test_cli::val1}, {"val2", test_cli::val2}, {"val3", test_cli::val3}};
+    test_cli value{test_cli::val2};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer(&map, CLI::ignore_case, CLI::ignore_underscore));
     args = {"-s", "val_1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, test::val1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(test_cli::val1 == value);
 
     args = {"-s", "VAL_2"};
     run();
-    EXPECT_EQ(value, test::val2);
+    CHECK(test_cli::val2 == value);
 
     args = {"-s", "VAL3"};
     run();
-    EXPECT_EQ(value, test::val3);
+    CHECK(test_cli::val3 == value);
 
     args = {"-s", "val_4"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
-    map["val4"] = test::val4;
+    map["val4"] = test_cli::val4;
     run();
-    EXPECT_EQ(value, test::val4);
+    CHECK(test_cli::val4 == value);
 }
 
-TEST_F(TApp, EnumTransformFnSharedPtrMap) {
-    enum class test : std::int16_t { val1 = 3, val2 = 4, val3 = 17, val4 = 37 };
-    auto map = std::make_shared<std::unordered_map<std::string, test>>();
+TEST_CASE_METHOD(TApp, "EnumTransformFnSharedPtrMap", "[transform]") {
+    enum class test_cli : std::int16_t { val1 = 3, val2 = 4, val3 = 17, val4 = 37 };
+    auto map = std::make_shared<std::unordered_map<std::string, test_cli>>();
     auto &mp = *map;
-    mp["val1"] = test::val1;
-    mp["val2"] = test::val2;
-    mp["val3"] = test::val3;
+    mp["val1"] = test_cli::val1;
+    mp["val2"] = test_cli::val2;
+    mp["val3"] = test_cli::val3;
 
-    test value{test::val2};
+    test_cli value{test_cli::val2};
     auto opt = app.add_option("-s", value)->transform(CLI::Transformer(map, CLI::ignore_case, CLI::ignore_underscore));
     args = {"-s", "val_1"};
     run();
-    EXPECT_EQ(1u, app.count("-s"));
-    EXPECT_EQ(1u, opt->count());
-    EXPECT_EQ(value, test::val1);
+    CHECK(app.count("-s") == 1u);
+    CHECK(opt->count() == 1u);
+    CHECK(test_cli::val1 == value);
 
     args = {"-s", "VAL_2"};
     run();
-    EXPECT_EQ(value, test::val2);
+    CHECK(test_cli::val2 == value);
 
     args = {"-s", "VAL3"};
     run();
-    EXPECT_EQ(value, test::val3);
+    CHECK(test_cli::val3 == value);
 
     args = {"-s", "val_4"};
-    EXPECT_THROW(run(), CLI::ConversionError);
+    CHECK_THROWS_AS(run(), CLI::ConversionError);
 
-    mp["val4"] = test::val4;
+    mp["val4"] = test_cli::val4;
     run();
-    EXPECT_EQ(value, test::val4);
+    CHECK(test_cli::val4 == value);
 }
 
 // Test a cascade of transform functions
-TEST_F(TApp, TransformCascade) {
+TEST_CASE_METHOD(TApp, "TransformCascade", "[transform]") {
 
     std::string output;
     auto opt = app.add_option("-s", output);
@@ -350,23 +351,23 @@ TEST_F(TApp, TransformCascade) {
     opt->check(CLI::IsMember({"abcd", "bbcd", "cbcd"}));
     args = {"-s", "abcd"};
     run();
-    EXPECT_EQ(output, "abcd");
+    CHECK("abcd" == output);
 
     args = {"-s", "Bbc"};
     run();
-    EXPECT_EQ(output, "bbcd");
+    CHECK("bbcd" == output);
 
     args = {"-s", "C_B"};
     run();
-    EXPECT_EQ(output, "cbcd");
+    CHECK("cbcd" == output);
 
     args = {"-s", "A"};
     run();
-    EXPECT_EQ(output, "abcd");
+    CHECK("abcd" == output);
 }
 
 // Test a cascade of transform functions
-TEST_F(TApp, TransformCascadeDeactivate) {
+TEST_CASE_METHOD(TApp, "TransformCascadeDeactivate", "[transform]") {
 
     std::string output;
     auto opt = app.add_option("-s", output);
@@ -380,70 +381,70 @@ TEST_F(TApp, TransformCascadeDeactivate) {
     opt->check(CLI::IsMember({"abcd", "bbcd", "cbcd"}).name("check"));
     args = {"-s", "abcd"};
     run();
-    EXPECT_EQ(output, "abcd");
+    CHECK("abcd" == output);
 
     args = {"-s", "Bbc"};
     run();
-    EXPECT_EQ(output, "bbcd");
+    CHECK("bbcd" == output);
 
     args = {"-s", "C_B"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     auto validator = opt->get_validator("tform2");
-    EXPECT_FALSE(validator->get_active());
-    EXPECT_EQ(validator->get_name(), "tform2");
+    CHECK(!validator->get_active());
+    CHECK("tform2" == validator->get_name());
     validator->active();
-    EXPECT_TRUE(validator->get_active());
+    CHECK(validator->get_active());
     args = {"-s", "C_B"};
     run();
-    EXPECT_EQ(output, "cbcd");
+    CHECK("cbcd" == output);
 
     opt->get_validator("check")->active(false);
     args = {"-s", "gsdgsgs"};
     run();
-    EXPECT_EQ(output, "gsdgsgs");
+    CHECK("gsdgsgs" == output);
 
-    EXPECT_THROW(opt->get_validator("sdfsdf"), CLI::OptionNotFound);
+    CHECK_THROWS_AS(opt->get_validator("sdfsdf"), CLI::OptionNotFound);
 }
 
-TEST_F(TApp, IntTransformFn) {
+TEST_CASE_METHOD(TApp, "IntTransformFn", "[transform]") {
     std::string value;
     app.add_option("-s", value)
         ->transform(
             CLI::CheckedTransformer(std::map<int, int>{{15, 5}, {18, 6}, {21, 7}}, [](int in) { return in - 10; }));
     args = {"-s", "25"};
     run();
-    EXPECT_EQ(value, "5");
+    CHECK("5" == value);
 
     args = {"-s", "6"};
     run();
-    EXPECT_EQ(value, "6");
+    CHECK("6" == value);
 
     args = {"-s", "45"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-s", "val_4"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, IntTransformNonConvertible) {
+TEST_CASE_METHOD(TApp, "IntTransformNonConvertible", "[transform]") {
     std::string value;
     app.add_option("-s", value)->transform(CLI::Transformer(std::map<int, int>{{15, 5}, {18, 6}, {21, 7}}));
     args = {"-s", "15"};
     run();
-    EXPECT_EQ(value, "5");
+    CHECK("5" == value);
 
     args = {"-s", "18"};
     run();
-    EXPECT_EQ(value, "6");
+    CHECK("6" == value);
 
     // value can't be converted to int so it is just ignored
     args = {"-s", "abcd"};
     run();
-    EXPECT_EQ(value, "abcd");
+    CHECK("abcd" == value);
 }
 
-TEST_F(TApp, IntTransformNonMerge) {
+TEST_CASE_METHOD(TApp, "IntTransformNonMerge", "[transform]") {
     std::string value;
     app.add_option("-s", value)
         ->transform(CLI::Transformer(std::map<int, int>{{15, 5}, {18, 6}, {21, 7}}) &
@@ -451,39 +452,39 @@ TEST_F(TApp, IntTransformNonMerge) {
                     "merge");
     args = {"-s", "15"};
     run();
-    EXPECT_EQ(value, "5");
+    CHECK("5" == value);
 
     args = {"-s", "18"};
     run();
-    EXPECT_EQ(value, "6");
+    CHECK("6" == value);
 
     // value can't be converted to int so it is just ignored
     args = {"-s", "abcd"};
     run();
-    EXPECT_EQ(value, "abcd");
+    CHECK("abcd" == value);
 
     args = {"-s", "25"};
     run();
-    EXPECT_EQ(value, "5");
+    CHECK("5" == value);
 
     args = {"-s", "31"};
     run();
-    EXPECT_EQ(value, "7");
+    CHECK("7" == value);
 
     auto help = app.help();
-    EXPECT_TRUE(help.find("15->5") != std::string::npos);
-    EXPECT_TRUE(help.find("25->5") != std::string::npos);
+    CHECK(help.find("15->5") != std::string::npos);
+    CHECK(help.find("25->5") != std::string::npos);
 
     auto validator = app.get_option("-s")->get_validator();
     help = validator->get_description();
-    EXPECT_TRUE(help.find("15->5") != std::string::npos);
-    EXPECT_TRUE(help.find("25->5") != std::string::npos);
+    CHECK(help.find("15->5") != std::string::npos);
+    CHECK(help.find("25->5") != std::string::npos);
 
     auto validator2 = app.get_option("-s")->get_validator("merge");
-    EXPECT_EQ(validator2, validator);
+    CHECK(validator == validator2);
 }
 
-TEST_F(TApp, IntTransformMergeWithCustomValidator) {
+TEST_CASE_METHOD(TApp, "IntTransformMergeWithCustomValidator", "[transform]") {
     std::string value;
     auto opt = app.add_option("-s", value)
                    ->transform(CLI::Transformer(std::map<int, int>{{15, 5}, {18, 6}, {21, 7}}) |
@@ -498,57 +499,57 @@ TEST_F(TApp, IntTransformMergeWithCustomValidator) {
                                "check");
     args = {"-s", "15"};
     run();
-    EXPECT_EQ(value, "5");
+    CHECK("5" == value);
 
     args = {"-s", "18"};
     run();
-    EXPECT_EQ(value, "6");
+    CHECK("6" == value);
 
     // value can't be converted to int so it is just ignored
     args = {"-s", "frog"};
     run();
-    EXPECT_EQ(value, "hops");
+    CHECK("hops" == value);
 
     args = {"-s", "25"};
     run();
-    EXPECT_EQ(value, "25");
+    CHECK("25" == value);
 
     auto help = app.help();
-    EXPECT_TRUE(help.find("15->5") != std::string::npos);
-    EXPECT_TRUE(help.find("OR") == std::string::npos);
+    CHECK(help.find("15->5") != std::string::npos);
+    CHECK(help.find("OR") == std::string::npos);
 
     auto validator = opt->get_validator("check");
-    EXPECT_EQ(validator->get_name(), "check");
+    CHECK("check" == validator->get_name());
     validator->active(false);
     help = app.help();
-    EXPECT_TRUE(help.find("15->5") == std::string::npos);
+    CHECK(help.find("15->5") == std::string::npos);
 }
 
-TEST_F(TApp, BoundTests) {
+TEST_CASE_METHOD(TApp, "BoundTests", "[transform]") {
     double value;
     app.add_option("-s", value)->transform(CLI::Bound(3.4, 5.9));
     args = {"-s", "15"};
     run();
-    EXPECT_EQ(value, 5.9);
+    CHECK(5.9 == value);
 
     args = {"-s", "3.689"};
     run();
-    EXPECT_EQ(value, std::stod("3.689"));
+    CHECK(std::stod("3.689") == value);
 
     // value can't be converted to int so it is just ignored
     args = {"-s", "abcd"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-s", "2.5"};
     run();
-    EXPECT_EQ(value, 3.4);
+    CHECK(3.4 == value);
 
     auto help = app.help();
-    EXPECT_TRUE(help.find("bounded to") != std::string::npos);
-    EXPECT_TRUE(help.find("[3.4 - 5.9]") != std::string::npos);
+    CHECK(help.find("bounded to") != std::string::npos);
+    CHECK(help.find("[3.4 - 5.9]") != std::string::npos);
 }
 
-TEST_F(TApp, NumberWithUnitCorrectlySplitNumber) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitCorrectlySplitNumber", "[transform]") {
     std::map<std::string, int> mapping{{"a", 10}, {"b", 100}, {"cc", 1000}};
 
     int value = 0;
@@ -556,43 +557,43 @@ TEST_F(TApp, NumberWithUnitCorrectlySplitNumber) {
 
     args = {"-n", "42"};
     run();
-    EXPECT_EQ(value, 42);
+    CHECK(42 == value);
 
     args = {"-n", "42a"};
     run();
-    EXPECT_EQ(value, 420);
+    CHECK(420 == value);
 
     args = {"-n", "  42  cc  "};
     run();
-    EXPECT_EQ(value, 42000);
+    CHECK(42000 == value);
     args = {"-n", "  -42  cc  "};
     run();
-    EXPECT_EQ(value, -42000);
+    CHECK(-42000 == value);
 }
 
-TEST_F(TApp, NumberWithUnitFloatTest) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitFloatTest", "[transform]") {
     std::map<std::string, double> mapping{{"a", 10}, {"b", 100}, {"cc", 1000}};
     double value{0.0};
     app.add_option("-n", value)->transform(CLI::AsNumberWithUnit(mapping));
 
     args = {"-n", "42"};
     run();
-    EXPECT_DOUBLE_EQ(value, 42);
+    CHECK(42 == Approx(value));
 
     args = {"-n", ".5"};
     run();
-    EXPECT_DOUBLE_EQ(value, .5);
+    CHECK(.5 == Approx(value));
 
     args = {"-n", "42.5 a"};
     run();
-    EXPECT_DOUBLE_EQ(value, 425);
+    CHECK(425 == Approx(value));
 
     args = {"-n", "42.cc"};
     run();
-    EXPECT_DOUBLE_EQ(value, 42000);
+    CHECK(42000 == Approx(value));
 }
 
-TEST_F(TApp, NumberWithUnitCaseSensitive) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitCaseSensitive", "[transform]") {
     std::map<std::string, int> mapping{{"a", 10}, {"A", 100}};
 
     int value{0};
@@ -600,14 +601,14 @@ TEST_F(TApp, NumberWithUnitCaseSensitive) {
 
     args = {"-n", "42a"};
     run();
-    EXPECT_EQ(value, 420);
+    CHECK(420 == value);
 
     args = {"-n", "42A"};
     run();
-    EXPECT_EQ(value, 4200);
+    CHECK(4200 == value);
 }
 
-TEST_F(TApp, NumberWithUnitCaseInsensitive) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitCaseInsensitive", "[transform]") {
     std::map<std::string, int> mapping{{"a", 10}, {"B", 100}};
 
     int value{0};
@@ -615,22 +616,22 @@ TEST_F(TApp, NumberWithUnitCaseInsensitive) {
 
     args = {"-n", "42a"};
     run();
-    EXPECT_EQ(value, 420);
+    CHECK(420 == value);
 
     args = {"-n", "42A"};
     run();
-    EXPECT_EQ(value, 420);
+    CHECK(420 == value);
 
     args = {"-n", "42b"};
     run();
-    EXPECT_EQ(value, 4200);
+    CHECK(4200 == value);
 
     args = {"-n", "42B"};
     run();
-    EXPECT_EQ(value, 4200);
+    CHECK(4200 == value);
 }
 
-TEST_F(TApp, NumberWithUnitMandatoryUnit) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitMandatoryUnit", "[transform]") {
     std::map<std::string, int> mapping{{"a", 10}, {"A", 100}};
 
     int value{0};
@@ -641,17 +642,17 @@ TEST_F(TApp, NumberWithUnitMandatoryUnit) {
 
     args = {"-n", "42a"};
     run();
-    EXPECT_EQ(value, 420);
+    CHECK(420 == value);
 
     args = {"-n", "42A"};
     run();
-    EXPECT_EQ(value, 4200);
+    CHECK(4200 == value);
 
     args = {"-n", "42"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, NumberWithUnitMandatoryUnit2) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitMandatoryUnit2", "[transform]") {
     std::map<std::string, int> mapping{{"a", 10}, {"B", 100}};
 
     int value{0};
@@ -662,49 +663,49 @@ TEST_F(TApp, NumberWithUnitMandatoryUnit2) {
 
     args = {"-n", "42A"};
     run();
-    EXPECT_EQ(value, 420);
+    CHECK(420 == value);
 
     args = {"-n", "42b"};
     run();
-    EXPECT_EQ(value, 4200);
+    CHECK(4200 == value);
 
     args = {"-n", "42"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, NumberWithUnitBadMapping) {
-    EXPECT_THROW(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"A", 100}},
-                                       CLI::AsNumberWithUnit::CASE_INSENSITIVE),
-                 CLI::ValidationError);
-    EXPECT_THROW(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"9", 100}}), CLI::ValidationError);
-    EXPECT_THROW(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"AA A", 100}}), CLI::ValidationError);
-    EXPECT_THROW(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"", 100}}), CLI::ValidationError);
+TEST_CASE_METHOD(TApp, "NumberWithUnitBadMapping", "[transform]") {
+    CHECK_THROWS_AS(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"A", 100}},
+                                          CLI::AsNumberWithUnit::CASE_INSENSITIVE),
+                    CLI::ValidationError);
+    CHECK_THROWS_AS(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"9", 100}}), CLI::ValidationError);
+    CHECK_THROWS_AS(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"AA A", 100}}), CLI::ValidationError);
+    CHECK_THROWS_AS(CLI::AsNumberWithUnit(std::map<std::string, int>{{"a", 10}, {"", 100}}), CLI::ValidationError);
 }
 
-TEST_F(TApp, NumberWithUnitBadInput) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitBadInput", "[transform]") {
     std::map<std::string, int> mapping{{"a", 10}, {"b", 100}};
 
     int value{0};
     app.add_option("-n", value)->transform(CLI::AsNumberWithUnit(mapping));
 
     args = {"-n", "13 a b"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
     args = {"-n", "13 c"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
     args = {"-n", "a"};
     // Assume 1.0 unit
-    EXPECT_NO_THROW(run());
+    CHECK_NOTHROW(run());
     args = {"-n", "12.0a"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
     args = {"-n", "a5"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
     args = {"-n", ""};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
     args = {"-n", "13 a-"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, NumberWithUnitIntOverflow) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitIntOverflow", "[transform]") {
     std::map<std::string, int> mapping{{"a", 1000000}, {"b", 100}, {"c", 101}};
 
     std::int32_t value;
@@ -712,147 +713,147 @@ TEST_F(TApp, NumberWithUnitIntOverflow) {
 
     args = {"-n", "1000 a"};
     run();
-    EXPECT_EQ(value, 1000000000);
+    CHECK(1000000000 == value);
 
     args = {"-n", "1000000 a"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-n", "-1000000 a"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-n", "21474836 b"};
     run();
-    EXPECT_EQ(value, 2147483600);
+    CHECK(2147483600 == value);
 
     args = {"-n", "21474836 c"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 }
 
-TEST_F(TApp, NumberWithUnitFloatOverflow) {
+TEST_CASE_METHOD(TApp, "NumberWithUnitFloatOverflow", "[transform]") {
     std::map<std::string, float> mapping{{"a", 2.f}, {"b", 1.f}, {"c", 0.f}};
 
     float value{0.0F};
     app.add_option("-n", value)->transform(CLI::AsNumberWithUnit(mapping));
 
     args = {"-n", "3e+38 a"};
-    EXPECT_THROW(run(), CLI::ValidationError);
+    CHECK_THROWS_AS(run(), CLI::ValidationError);
 
     args = {"-n", "3e+38 b"};
     run();
-    EXPECT_FLOAT_EQ(value, 3e+38f);
+    CHECK(3e+38f == Approx(value));
 
     args = {"-n", "3e+38 c"};
     run();
-    EXPECT_FLOAT_EQ(value, 0.f);
+    CHECK(0.f == Approx(value));
 }
 
-TEST_F(TApp, AsSizeValue1000_1024) {
+TEST_CASE_METHOD(TApp, "AsSizeValue1000_1024", "[transform]") {
     std::uint64_t value{0};
     app.add_option("-s", value)->transform(CLI::AsSizeValue(true));
 
     args = {"-s", "10240"};
     run();
-    EXPECT_EQ(value, 10240u);
+    CHECK(10240u == value);
 
     args = {"-s", "1b"};
     run();
-    EXPECT_EQ(value, 1u);
+    CHECK(1u == value);
 
     std::uint64_t k_value{1000u};
     std::uint64_t ki_value{1024u};
     args = {"-s", "1k"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1kb"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1 Kb"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1ki"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1kib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     k_value = 1000ull * 1000u;
     ki_value = 1024ull * 1024u;
     args = {"-s", "1m"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1mb"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1mi"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1mib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     k_value = 1000ull * 1000u * 1000u;
     ki_value = 1024ull * 1024u * 1024u;
     args = {"-s", "1g"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1gb"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1gi"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1gib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     k_value = 1000ull * 1000u * 1000u * 1000u;
     ki_value = 1024ull * 1024u * 1024u * 1024u;
     args = {"-s", "1t"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1tb"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1ti"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1tib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     k_value = 1000ull * 1000u * 1000u * 1000u * 1000u;
     ki_value = 1024ull * 1024u * 1024u * 1024u * 1024u;
     args = {"-s", "1p"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1pb"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1pi"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1pib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     k_value = 1000ull * 1000u * 1000u * 1000u * 1000u * 1000u;
     ki_value = 1024ull * 1024u * 1024u * 1024u * 1024u * 1024u;
     args = {"-s", "1e"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1eb"};
     run();
-    EXPECT_EQ(value, k_value);
+    CHECK(k_value == value);
     args = {"-s", "1ei"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1eib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 }
 
-TEST_F(TApp, duration_test) {
+TEST_CASE_METHOD(TApp, "duration_test", "[transform]") {
     std::chrono::seconds duration{1};
 
     app.option_defaults()->ignore_case();
@@ -863,107 +864,107 @@ TEST_F(TApp, duration_test) {
         ->capture_default_str()
         ->transform(CLI::AsNumberWithUnit(
             std::map<std::string, std::size_t>{{"sec", 1}, {"min", 60}, {"h", 3600}, {"day", 24 * 3600}}));
-    EXPECT_NO_THROW(app.parse(std::vector<std::string>{"1 day", "--duration"}));
+    CHECK_NOTHROW(app.parse(std::vector<std::string>{"1 day", "--duration"}));
 
-    EXPECT_EQ(duration, std::chrono::seconds(86400));
+    CHECK(std::chrono::seconds(86400) == duration);
 }
 
-TEST_F(TApp, AsSizeValue1024) {
+TEST_CASE_METHOD(TApp, "AsSizeValue1024", "[transform]") {
     std::uint64_t value{0};
     app.add_option("-s", value)->transform(CLI::AsSizeValue(false));
 
     args = {"-s", "10240"};
     run();
-    EXPECT_EQ(value, 10240u);
+    CHECK(10240u == value);
 
     args = {"-s", "1b"};
     run();
-    EXPECT_EQ(value, 1u);
+    CHECK(1u == value);
 
     std::uint64_t ki_value{1024u};
     args = {"-s", "1k"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1kb"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1 Kb"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1ki"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1kib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     ki_value = 1024ull * 1024u;
     args = {"-s", "1m"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1mb"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1mi"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1mib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     ki_value = 1024ull * 1024u * 1024u;
     args = {"-s", "1g"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1gb"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1gi"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1gib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     ki_value = 1024ull * 1024u * 1024u * 1024u;
     args = {"-s", "1t"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1tb"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1ti"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1tib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     ki_value = 1024ull * 1024u * 1024u * 1024u * 1024u;
     args = {"-s", "1p"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1pb"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1pi"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1pib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 
     ki_value = 1024ull * 1024u * 1024u * 1024u * 1024u * 1024u;
     args = {"-s", "1e"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1eb"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1ei"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
     args = {"-s", "1eib"};
     run();
-    EXPECT_EQ(value, ki_value);
+    CHECK(ki_value == value);
 }
diff --git a/packages/CLI11/tests/TrueFalseTest.cpp b/packages/CLI11/tests/TrueFalseTest.cpp
index 7c37d456dc79c9cacb409573a3ce184e02a90a8c..aa8886c092ab21c1b464ec833c3fd00924378924 100644
--- a/packages/CLI11/tests/TrueFalseTest.cpp
+++ b/packages/CLI11/tests/TrueFalseTest.cpp
@@ -6,31 +6,24 @@
 
 #include "app_helper.hpp"
 
-/// This allows a set of strings to be run over by a test
-struct TApp_TBO : public TApp_base, testing::TestWithParam<const char *> {};
-
-TEST_P(TApp_TBO, TrueBoolOption) {
+TEST_CASE_METHOD(TApp, "True Bool Option", "[bool][flag]") {
+    // Strings needed here due to MSVC 2015.
+    auto param = GENERATE(as<std::string>{}, "true", "on", "True", "ON");
     bool value{false};  // Not used, but set just in case
     app.add_option("-b,--bool", value);
-    args = {"--bool", GetParam()};
+    args = {"--bool", param};
     run();
-    EXPECT_EQ(1u, app.count("--bool"));
-    EXPECT_TRUE(value);
+    CHECK(app.count("--bool") == 1u);
+    CHECK(value);
 }
 
-// Change to INSTANTIATE_TEST_SUITE_P in GTest master
-INSTANTIATE_TEST_SUITE_P(TrueBoolOptions_test, TApp_TBO, testing::Values("true", "on", "True", "ON"));
-
-/// This allows a set of strings to be run over by a test
-struct TApp_FBO : public TApp_base, public ::testing::TestWithParam<const char *> {};
+TEST_CASE_METHOD(TApp, "False Bool Option", "[bool][flag]") {
+    auto param = GENERATE(as<std::string>{}, "false", "off", "False", "OFF");
 
-TEST_P(TApp_FBO, FalseBoolOptions) {
     bool value{true};  // Not used, but set just in case
     app.add_option("-b,--bool", value);
-    args = {"--bool", GetParam()};
+    args = {"--bool", param};
     run();
-    EXPECT_EQ(1u, app.count("--bool"));
-    EXPECT_FALSE(value);
+    CHECK(app.count("--bool") == 1u);
+    CHECK_FALSE(value);
 }
-
-INSTANTIATE_TEST_SUITE_P(FalseBoolOptions_test, TApp_FBO, ::testing::Values("false", "off", "False", "OFF"));
diff --git a/packages/CLI11/tests/WindowsTest.cpp b/packages/CLI11/tests/WindowsTest.cpp
index 41053bbe9212c6c1f6134c9e230d22c77a04cc3f..847266649965e6fc03176cda5862d46d07dbe103 100644
--- a/packages/CLI11/tests/WindowsTest.cpp
+++ b/packages/CLI11/tests/WindowsTest.cpp
@@ -10,10 +10,10 @@
 // This test verifies that CLI11 still works if
 // Windows.h is included. #145
 
-TEST_F(TApp, WindowsTestSimple) {
+TEST_CASE_METHOD(TApp, "WindowsTestSimple", "[windows]") {
     app.add_flag("-c,--count");
     args = {"-c"};
     run();
-    EXPECT_EQ(1u, app.count("-c"));
-    EXPECT_EQ(1u, app.count("--count"));
+    CHECK(app.count("-c") == 1u);
+    CHECK(app.count("--count") == 1u);
 }
diff --git a/packages/CLI11/tests/app_helper.hpp b/packages/CLI11/tests/app_helper.hpp
index 6b250a422cb5f13eb549d9a5bf9d91f58f029953..0f72adda27064e17d3cb97f242526e5b1b2b4501 100644
--- a/packages/CLI11/tests/app_helper.hpp
+++ b/packages/CLI11/tests/app_helper.hpp
@@ -12,7 +12,7 @@
 #include "CLI/CLI.hpp"
 #endif
 
-#include "gtest/gtest.h"
+#include "catch.hpp"
 #include <iostream>
 #include <string>
 #include <utility>
@@ -20,11 +20,11 @@
 
 using input_t = std::vector<std::string>;
 
-class TApp_base {
+class TApp {
   public:
     CLI::App app{"My Test Program"};
     input_t args{};
-    virtual ~TApp_base() = default;
+    virtual ~TApp() = default;
     void run() {
         // It is okay to re-parse - clear is called automatically before a parse.
         input_t newargs = args;
@@ -33,8 +33,6 @@ class TApp_base {
     }
 };
 
-class TApp : public TApp_base, public ::testing::Test {};
-
 class TempFile {
     std::string _name{};
 
diff --git a/packages/CLI11/tests/link_test_2.cpp b/packages/CLI11/tests/link_test_2.cpp
index ba4cc8fe1d01be7bce5276c8b3fcd5349afb7790..b8544ab47a8104b831b66cd409490c200261bcb7 100644
--- a/packages/CLI11/tests/link_test_2.cpp
+++ b/packages/CLI11/tests/link_test_2.cpp
@@ -6,12 +6,12 @@
 
 #include "CLI/CLI.hpp"
 #include "CLI/Timer.hpp"
-#include <gtest/gtest.h>
+#include "catch.hpp"
 
 int do_nothing();
 
 // Verifies there are no unguarded inlines
-TEST(Link, DoNothing) {
+TEST_CASE("Link: DoNothing", "[link]") {
     int a = do_nothing();
-    EXPECT_EQ(7, a);
+    CHECK(a == 7);
 }
diff --git a/packages/CLI11/tests/main.cpp b/packages/CLI11/tests/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1c2725f53e260069f3cc867bb1398ac21cb4b2f
--- /dev/null
+++ b/packages/CLI11/tests/main.cpp
@@ -0,0 +1,8 @@
+// Copyright (c) 2017-2020, University of Cincinnati, developed by Henry Schreiner
+// under NSF AWARD 1414736 and by the respective contributors.
+// All rights reserved.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/packages/PEGTL/.github/workflows/clang-analyze.yml b/packages/PEGTL/.github/workflows/clang-analyze.yml
index 2226ea7b00338b26996a6ea8bc19d93eb10e4a8e..523b105564db92155b5cac93281e9e5d36776868 100644
--- a/packages/PEGTL/.github/workflows/clang-analyze.yml
+++ b/packages/PEGTL/.github/workflows/clang-analyze.yml
@@ -4,7 +4,7 @@ on: [push, pull_request]
 
 jobs:
   clang-analyze:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v2
diff --git a/packages/PEGTL/.github/workflows/clang-tidy.yml b/packages/PEGTL/.github/workflows/clang-tidy.yml
index 41e1003c48514127943e3f2da59e119b7c75e670..4da8af6ff8b1695835e65404a825f51d673bc49a 100644
--- a/packages/PEGTL/.github/workflows/clang-tidy.yml
+++ b/packages/PEGTL/.github/workflows/clang-tidy.yml
@@ -4,7 +4,7 @@ on: [push, pull_request]
 
 jobs:
   clang-tidy:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v2
diff --git a/packages/PEGTL/.github/workflows/linux.yml b/packages/PEGTL/.github/workflows/linux.yml
index ba30d3d98d9b4753649bd71fc5bdfa984c05971e..139982af19acf195bf688fc94db6ac22e85b841e 100644
--- a/packages/PEGTL/.github/workflows/linux.yml
+++ b/packages/PEGTL/.github/workflows/linux.yml
@@ -8,16 +8,14 @@ jobs:
       fail-fast: false
       matrix:
         compiler:
-          - g++-7
-          - g++-8
           - g++-9
           - g++-10
-          - clang++-8
           - clang++-9
           - clang++-10
+          - clang++-11
         build_type: [Debug, Release]
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     env:
       CXX: ${{ matrix.compiler }}
@@ -36,22 +34,34 @@ jobs:
     - working-directory: build/
       run: ctest --output-on-failure
 
-  linux-gcc-extra:
+  linux-old:
     strategy:
       fail-fast: false
       matrix:
-        flags: ["-fno-rtti"]
+        compiler:
+          - g++-7
+          - g++-8
+          - clang++-6.0
+          - clang++-7
+          - clang++-8
         build_type: [Debug, Release]
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
+
+    env:
+      CXX: ${{ matrix.compiler }}
 
     steps:
     - uses: actions/checkout@v2
 
+    - run: sudo apt-get update
+
+    - run: sudo apt-get install -y ${{ matrix.compiler }}
+
     - run: cmake -E make_directory build
 
     - working-directory: build/
-      run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_FLAGS="${{ matrix.flags }}"
+      run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
 
     - working-directory: build/
       run: cmake --build .
@@ -59,17 +69,14 @@ jobs:
     - working-directory: build/
       run: ctest --output-on-failure
 
-  linux-clang-extra:
+  linux-gcc-extra:
     strategy:
       fail-fast: false
       matrix:
-        flags: ["-fno-rtti", "-fms-extensions"]
+        flags: ["-fno-rtti"]
         build_type: [Debug, Release]
 
-    runs-on: ubuntu-20.04
-
-    env:
-      CXX: clang++
+    runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v2
@@ -85,32 +92,25 @@ jobs:
     - working-directory: build/
       run: ctest --output-on-failure
 
-  clang-conanio:
+  linux-clang-extra:
     strategy:
       fail-fast: false
       matrix:
-        image:
-          # List: https://github.com/conan-io/conan-docker-tools
-          - clang50
-          - clang60
-          - clang7
-          - clang9-x86
-          - clang11
+        flags: ["-fno-rtti", "-fms-extensions"]
         build_type: [Debug, Release]
 
-    container:
-      image: conanio/${{ matrix.image }}
-      options: --user root
-
     runs-on: ubuntu-latest
 
+    env:
+      CXX: clang++
+
     steps:
-    - uses: actions/checkout@v1
+    - uses: actions/checkout@v2
 
     - run: cmake -E make_directory build
 
     - working-directory: build/
-      run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+      run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_FLAGS="${{ matrix.flags }}"
 
     - working-directory: build/
       run: cmake --build .
diff --git a/packages/PEGTL/.github/workflows/no-exceptions.yml b/packages/PEGTL/.github/workflows/no-exceptions.yml
index 3dd3d74b835d02a9b06dfa404b44793ac44900a3..eeb964f2540b1a6711afc9c689f155c7ee63b827 100644
--- a/packages/PEGTL/.github/workflows/no-exceptions.yml
+++ b/packages/PEGTL/.github/workflows/no-exceptions.yml
@@ -7,10 +7,10 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        compiler: [g++-10, clang++-10]
+        compiler: [g++, clang++]
         build_type: [Debug, Release]
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     env:
       CXX: ${{ matrix.compiler }}
diff --git a/packages/PEGTL/.github/workflows/sanitizer.yml b/packages/PEGTL/.github/workflows/sanitizer.yml
index 1f478cb067072b19416fde873c39725e1a3f03d9..61d61aebdfa1b6640934ed95dfeeb4581a6e8b4a 100644
--- a/packages/PEGTL/.github/workflows/sanitizer.yml
+++ b/packages/PEGTL/.github/workflows/sanitizer.yml
@@ -10,7 +10,7 @@ jobs:
         cxx: [g++, clang++]
         sanitizer: [address, undefined]
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     env:
       CXX: ${{ matrix.cxx }}
diff --git a/packages/PEGTL/.gitrepo b/packages/PEGTL/.gitrepo
index 393af709f67ffe786ffbcab99ba7af6772acfce4..d60e16e83cde84179e4ec27115e58d51dd0cd242 100644
--- a/packages/PEGTL/.gitrepo
+++ b/packages/PEGTL/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:taocpp/PEGTL.git
 	branch = master
-	commit = 57f8ebe0045d7e35cbb251536146a57bc0cf9db5
-	parent = 0a259f7e3e4fe2364b8d45b641c7f48ff3bc7341
-	cmdver = 0.4.3
+	commit = c131c2e2aad67037285ef39d11ec4f1d28d4fc73
+	parent = 2f2fa0e22bd114f44f78c5bee89bc13bd0959d1d
 	method = merge
+	cmdver = 0.4.3
diff --git a/packages/PEGTL/README.md b/packages/PEGTL/README.md
index 64d090c994d932cb813415879576ff82443506ed..e613c97f9b5a542d542103b3251bf9067806bdc6 100644
--- a/packages/PEGTL/README.md
+++ b/packages/PEGTL/README.md
@@ -112,6 +112,7 @@ In appreciation of all contributions here are the people that have [directly con
 [<img alt="pauloscustodio" src="https://avatars.githubusercontent.com/u/70773" width="120">](https://github.com/pauloscustodio)
 [<img alt="pleroux0" src="https://avatars.githubusercontent.com/u/39619854" width="120">](https://github.com/pleroux0)
 [<img alt="quadfault" src="https://avatars.githubusercontent.com/u/30195320" width="120">](https://github.com/quadfault)
+[<img alt="quarticcat" src="https://avatars.githubusercontent.com/u/70888415" width="120">](https://github.com/quarticcat)
 [<img alt="ras0219" src="https://avatars.githubusercontent.com/u/533828" width="120">](https://github.com/ras0219)
 [<img alt="redmercury" src="https://avatars.githubusercontent.com/u/4424222" width="120">](https://github.com/redmercury)
 [<img alt="robertcampion" src="https://avatars.githubusercontent.com/u/4220569" width="120">](https://github.com/robertcampion)
diff --git a/packages/PEGTL/doc/Actions-and-States.md b/packages/PEGTL/doc/Actions-and-States.md
index 951720b7d7fcb073b47a94ac797e4b5c684a26d6..4c8101682972bde687cad06b2a1d96024e7afd79 100644
--- a/packages/PEGTL/doc/Actions-and-States.md
+++ b/packages/PEGTL/doc/Actions-and-States.md
@@ -24,6 +24,7 @@ When an action is *applied*, the corresponding function receives the *states*, a
 * [Changing Actions and States](#changing-actions-and-states)
 * [Match](#match)
 * [Nothing](#nothing)
+* [Backtracking](#backtracking)
 * [Troubleshooting](#troubleshooting)
   * [Boolean Return](#boolean-return)
   * [State Mismatch](#state-mismatch)
@@ -285,7 +286,7 @@ std::string unescape( const std::string& escaped )
 
 At the end of the parsing run, the complete unescaped string can be found in the aptly named variable `unescaped`.
 
-A more complete example of how to unescape strings can be found in `src/examples/pegtl/unescape.cpp`.
+A more complete example of how to unescape strings can be found in `src/example/pegtl/unescape.cpp`.
 
 ## Specialising
 
@@ -506,8 +507,72 @@ For example when a class `b` is derived from `change_state`, it also gains that
 At this point `b` is allowed to either have or not have an `apply()` or `apply0()`.
 By letting `b` also derive from one of the three mentioned classes, the `maybe_nothing` will be ignored and `b` will be checked to have or not have the functions as dictated by the respective additional base class.
 
+## Backtracking
+
+Sometimes there can be *backtracking* during a parsing run which can lead to Actions being called in places where their effects are undesired.
+While it might be intuitively clear what backtracking is, for the purpose of the following discussion we give a slightly more formal definition.
+
+We speak of *backtracking* across a rule `S` when there is a rule `R` of which `S` is a (direct or indirect) sub-rule and during a parsing run
+1. `R` returns local failure after
+2. `S` succeeded and its success is a requirement for the success of `R` and
+3. it is "still possible" for the top-level grammar rule of the parsing run to succeed.
+
+In this case the input will have been rewound to the point at which `R` was attempted to match and all effects of `S` on the Input will have been undone, however, and this is the subject of this section, any action attached to `S` will have been already performed without there being an automatic "undo".
+
+#### The AAC-Problem
+
+In some cases it is easy to rewrite the grammar in a way that prevents backtracking.
+This simultaneously removes the issue of having to undo actions and improves parsing performance.
+
+The prototypical case for which such a rewrite can be done is `R = sor< seq< A, B >, seq< A, C > >` where `A`, `B` and `C` are arbitrary rules.
+
+If during a parsing run there are actions attached to `A` and `C`, and the input matches `seq< A, C >` but not `seq< A, B >`, then the action for `A` will be called *twice* before the action for `C`, which gives this problem its "AAC" name, given that what happens is:
+
+* Begin `sor< seq< A, B >, seq< A, C > >`
+* Begin `seq< A, B >`
+* Begin `A`
+* Success `A` with action called
+* Begin `B`
+* Failure `B`
+* Failure `seq< A, B >`
+* Begin `seq< A, C >`
+* Begin `A` at the same position as the begin `A` above
+* Success `A` with action called again on the same input
+* Begin `C`
+* Success `C`
+* Success `seq< A, C >`
+* Success `sor< seq< A, B >, seq< A, C > >`
+
+#### Rewriting
+
+In practice the structure of the rule might be more complicated than the pure AAC-problem which will make it harder to recognise the pattern.
+One solution is to rewrite `R` as `R' = seq< A, sor< B, C > >` where of course any action for `A` will be called at most once for every successful match of `R'`.
+
+#### Manual Undo 
+
+Another solution is to undo the effects of the Action attached to `A` in case the encompassing `seq< A, B >` (or `seq< A, C >`) fail.
+
+The advantage of this approach is that the implementation of the Action for `A` can pretend that is only called when really needed.
+The disadvantage is that there is no function on the Action that is called in the case of failure which requires the user to either write a custom `match()` function in the Action for `seq< A, B >` or to implement the `failure()` function in a custom [Control class](Control-and-Debug.md).
+
+#### Manual Commit
+
+A further solution is to let the Action for `A` perform its job "to the side", and only "commit" the effects to the target data structure in the Action for `seq< A, B >`.
+
+For example if the Action attached to `A` takes the matched portion of the Input as `std::string` and appends it to `std::vector< std::string >` one could change said Action for `A` to only fill some temporary string in one of the States, and create an Action for `seq< A, B >` that, after it is called on success of that rule, appends the aforementioned temporary string to the target vector.
+
+#### Looking Ahead
+
+When everything else fails and a quick-and-dirty solution to Actions being called too often in the presence of backtracking is required and/or performance is not of prime importance it is relatively easy to solve the problem by employing the infinite look-ahead capability of PEGs.
+
+When backtracking across `S` is a problem because an Action attached to `S` can be called when `S` succeeds even though there is a higher-up rule `R` that can still fail then simply replace `R` with `seq< at< R >, R >` in the grammar.
+
+Remembering that `at` disables all Actions explains how this solves the problem; we first verify without Actions that `R` will indeed match at this point and only then match `R` again with Actions enabled.
+
 ## Troubleshooting
 
+The following lists a couple of frequently encountered Action-related errors and how to fix them.
+
 ### Boolean Return
 
 Actions returning `bool` are an advanced use case that should be used with caution.
diff --git a/packages/PEGTL/doc/Changelog.md b/packages/PEGTL/doc/Changelog.md
index d1b57f131b5aa72f8221d024e3a2c705ed8bc77d..cc194318f8cd00dc5a894e713003369036b02506 100644
--- a/packages/PEGTL/doc/Changelog.md
+++ b/packages/PEGTL/doc/Changelog.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## 3.2.1
+
+**Not yet released**
+
+* Added an optional limiter to guard against infinite recursion.
+
 ## 3.2.0
 
 Released 2021-01-15
diff --git a/packages/PEGTL/doc/Errors-and-Exceptions.md b/packages/PEGTL/doc/Errors-and-Exceptions.md
index f080c0917885c1f9b1793e81acaf4222f89dee3d..7b8f018d84e757c919c2eae8f91864ccac87a060 100644
--- a/packages/PEGTL/doc/Errors-and-Exceptions.md
+++ b/packages/PEGTL/doc/Errors-and-Exceptions.md
@@ -162,7 +162,7 @@ This is often insufficient and one would like to provide more meaningful error m
 
 A practical technique to provide customised error messages for all `must<>` error points uses the `must_if<>` helper.
 
-For an example of this method see `src/examples/pegtl/json_errors.hpp`, where all errors that might occur in the supplied JSON grammar are customised like this:
+For an example of this method see `src/example/pegtl/json_errors.hpp`, where all errors that might occur in the supplied JSON grammar are customised like this:
 
 ```c++
 template< typename > inline constexpr const char* error_message = nullptr;
diff --git a/packages/PEGTL/doc/Grammar-Analysis.md b/packages/PEGTL/doc/Grammar-Analysis.md
index 964adb9c19e27963a9ec98202208e98b086e7894..629982d8ec624f154c86fb95e16edacec6704a5a 100644
--- a/packages/PEGTL/doc/Grammar-Analysis.md
+++ b/packages/PEGTL/doc/Grammar-Analysis.md
@@ -56,7 +56,7 @@ This support automatically extends to all custom rules built "the usual way" via
 
 For true custom rules, i.e. rules that implement their own `match()` function, the following steps need to be taken for them to work with the grammar analysis.
 
-1. The rule needs a `rule_t` that, usually for true custom rules, is a type alias for the grammar rule itself.
+1. The rule needs a [`rule_t`](Meta-Data-and-Visit.md#rule-type) that, usually for true custom rules, is a type alias for the grammar rule itself.
 2. There needs to be a specialisation of the `analyze_traits<>` for the custom rule, with an additional first template parameter:
 
 Assuming a custom rule like the following
diff --git a/packages/PEGTL/doc/Parse-Tree.md b/packages/PEGTL/doc/Parse-Tree.md
index 039208294c18c3a201492e6f8c9da75affcc5bd5..baa3b24a7fc1e4e54daac1e1e56d0f461900f336 100644
--- a/packages/PEGTL/doc/Parse-Tree.md
+++ b/packages/PEGTL/doc/Parse-Tree.md
@@ -18,6 +18,7 @@ It provides the basic infrastructure to build a parse tree that
 * [Transformers](#transformers)
 * [`tao::pegtl::parse_tree::node`](#taopegtlparse_treenode)
 * [Custom Node Class](#custom-node-class)
+* [Requirements](#requirements)
 
 ## Full Parse Tree
 
@@ -235,4 +236,8 @@ struct my_node
 };
 ```
 
+## Requirements
+
+The parse tree uses a rule's meta data supplied by [`subs_t`](Meta-Data-and-Visit.md#sub-rules) for internal optimizations.
+
 Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
diff --git a/packages/PEGTL/doc/README.md b/packages/PEGTL/doc/README.md
index 09b6fbcf15a32a3202faaa073dd9c96f1fbde813..7c028ccae0f67e1a8581605c48f4fb5f44491f1a 100644
--- a/packages/PEGTL/doc/README.md
+++ b/packages/PEGTL/doc/README.md
@@ -44,6 +44,7 @@
   * [Changing Actions and States](Actions-and-States.md#changing-actions-and-states)
   * [Match](Actions-and-States.md#match)
   * [Nothing](Actions-and-States.md#nothing)
+  * [Backtracking](Actions-and-States.md#backtracking)
   * [Troubleshooting](Actions-and-States.md#troubleshooting)
     * [Boolean Return](Actions-and-States.md#boolean-return)
     * [State Mismatch](Actions-and-States.md#state-mismatch)
@@ -103,6 +104,7 @@
   * [Transformer](Parse-Tree.md#transformer)
   * [`tao::pegtl::parse_tree::node`](Parse-Tree.md#taopegtlparse_treenode)
   * [Custom Node Class](Parse-Tree.md#custom-node-class)
+  * [Requirements](Parse-Tree.md#requirements)
 * [Meta Data and Visit](Meta-Data-and-Visit.md)
   * [Internals](Meta-Data-and-Visit.md#internals)
   * [Rule Type](Meta-Data-and-Visit.md#rule-type)
diff --git a/packages/PEGTL/doc/Rule-Reference.md b/packages/PEGTL/doc/Rule-Reference.md
index a39700edf41e97cddcce989d0d67cd0c5cefe1eb..79dd46ccf969050b4f40fbafc9ebe56bee2c5699 100644
--- a/packages/PEGTL/doc/Rule-Reference.md
+++ b/packages/PEGTL/doc/Rule-Reference.md
@@ -100,7 +100,7 @@ These rules are in namespace `tao::pegtl`.
 * Enables all actions (if any).
 * [Meta data] and [implementation] mapping:
   - `enable<>::rule_t` is `internal::success`
-  - `enable< R >::rule_t` is `internal::enable<, R >`
+  - `enable< R >::rule_t` is `internal::enable< R >`
   - `enable< R >::subs_t` is `type_list< R >`
   - `enable< R... >::rule_t` is `internal::enable< internal::seq< R... > >`
   - `enable< R... >::subs_t` is `type_list< internal::seq< R... > >`
@@ -368,6 +368,8 @@ Note that the `true` template parameter to `internal::if_must` corresponds to th
   - `rematch< R, S... >::rule_t` is `internal::rematch< R, S... >`
   - `rematch< R, S... >::subs_t` is `type_list< R, S... >`
 
+Note that the `S` do *not* need to match *all* of the input matched by `R` (which is why `minus` uses `eof` in its implementation).
+
 ###### `rep< Num, R... >`
 
 * Matches `seq< R... >` for `Num` times without checking for further matches.
diff --git a/packages/PEGTL/include/tao/pegtl/buffer_input.hpp b/packages/PEGTL/include/tao/pegtl/buffer_input.hpp
index 0003442b0a07c0953ded634e27819f0331a14157..60d39d34935d21fe33be102ac7146ff4aa5e3a84 100644
--- a/packages/PEGTL/include/tao/pegtl/buffer_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/buffer_input.hpp
@@ -66,8 +66,8 @@ namespace TAO_PEGTL_NAMESPACE
 
       ~buffer_input() = default;
 
-      void operator=( const buffer_input& ) = delete;
-      void operator=( buffer_input&& ) = delete;
+      buffer_input& operator=( const buffer_input& ) = delete;
+      buffer_input& operator=( buffer_input&& ) = delete;
 
       [[nodiscard]] bool empty()
       {
@@ -216,6 +216,9 @@ namespace TAO_PEGTL_NAMESPACE
       iterator_t m_current;
       char* m_end;
       const Source m_source;
+
+   public:
+      std::size_t private_depth = 0;
    };
 
 }  // namespace TAO_PEGTL_NAMESPACE
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp b/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp
index 586710735f9225d65381cf547af2c56d929bd8ed..0bfbd18c2fd77615ab5ff3eb7d1aec8914221807 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp
@@ -44,8 +44,8 @@ namespace TAO_PEGTL_NAMESPACE
 
          ~analyze_cycles_impl() = default;
 
-         void operator=( analyze_cycles_impl&& ) = delete;
-         void operator=( const analyze_cycles_impl& ) = delete;
+         analyze_cycles_impl& operator=( analyze_cycles_impl&& ) = delete;
+         analyze_cycles_impl& operator=( const analyze_cycles_impl& ) = delete;
 
          [[nodiscard]] std::size_t problems()
          {
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/check_bytes.hpp b/packages/PEGTL/include/tao/pegtl/contrib/check_bytes.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..84333c7d7ab6425ab0b923b88eb68d9046cc5668
--- /dev/null
+++ b/packages/PEGTL/include/tao/pegtl/contrib/check_bytes.hpp
@@ -0,0 +1,55 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+
+#ifndef TAO_PEGTL_CONTRIB_CHECK_BYTES_HPP
+#define TAO_PEGTL_CONTRIB_CHECK_BYTES_HPP
+
+#include "../apply_mode.hpp"
+#include "../config.hpp"
+#include "../match.hpp"
+#include "../nothing.hpp"
+#include "../rewind_mode.hpp"
+
+#if defined( __cpp_exceptions )
+#include "../parse_error.hpp"
+#else
+#include <cstdio>
+#include <exception>
+#endif
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   template< std::size_t Maximum >
+   struct check_bytes
+      : maybe_nothing
+   {
+      template< typename Rule,
+                pegtl::apply_mode A,
+                pegtl::rewind_mode M,
+                template< typename... >
+                class Action,
+                template< typename... >
+                class Control,
+                typename ParseInput,
+                typename... States >
+      static bool match( ParseInput& in, States&&... st )
+      {
+         const auto* start = in.current();
+         if( TAO_PEGTL_NAMESPACE::match< Rule, A, M, Action, Control >( in, st... ) ) {
+            if( std::size_t( in.current() - start ) > Maximum ) {
+#if defined( __cpp_exceptions )
+               throw TAO_PEGTL_NAMESPACE::parse_error( "maximum allowed rule consumption exceeded", in );
+#else
+               std::fputs( "maximum allowed rule consumption exceeded\n", stderr );
+               std::terminate();
+#endif
+            }
+            return true;
+         }
+         return false;
+      }
+   };
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#endif
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp b/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp
index c04db3f541138c79fe50813a2b6f6e133ffd8251..ebf6e7f1ca1730a47617ccaea29d565270b5a888 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp
@@ -26,7 +26,7 @@ namespace TAO_PEGTL_NAMESPACE
                 class Control,
                 typename ParseInput,
                 typename... States >
-      [[nodiscard]] static bool match( ParseInput& in, States&... st )
+      [[nodiscard]] static bool match( ParseInput& in, States&&... st )
       {
          const T t( static_cast< const ParseInput& >( in ), st... );
          return TAO_PEGTL_NAMESPACE::match< Rule, A, M, Action, Control >( in, st... );
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp
index 94df74cab2edf16ab5ebed16a5eeb6af4db21873..c0071e959d0c3bd52f405b5800e608107716a91c 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp
@@ -12,7 +12,7 @@
 namespace TAO_PEGTL_NAMESPACE::internal
 {
    template< typename... Cs >
-   class set_stack_guard
+   class [[nodiscard]] set_stack_guard
    {
    public:
       template< typename... Ts >
@@ -24,8 +24,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
       set_stack_guard( set_stack_guard&& ) = delete;
       set_stack_guard( const set_stack_guard& ) = delete;
 
-      void operator=( set_stack_guard&& ) = delete;
-      void operator=( const set_stack_guard& ) = delete;
+      set_stack_guard& operator=( set_stack_guard&& ) = delete;
+      set_stack_guard& operator=( const set_stack_guard& ) = delete;
 
       ~set_stack_guard()
       {
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp
index 146da0f1afe49f72005fc05389da261d144457f4..4b0cfbe8bf730bdac56b39c6e09cd627b5f928b5 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp
@@ -12,7 +12,7 @@
 namespace TAO_PEGTL_NAMESPACE::internal
 {
    template< typename... Cs >
-   class vector_stack_guard
+   class [[nodiscard]] vector_stack_guard
    {
    public:
       template< typename... Ts >
@@ -25,8 +25,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
       vector_stack_guard( vector_stack_guard&& ) = delete;
       vector_stack_guard( const vector_stack_guard& ) = delete;
 
-      void operator=( vector_stack_guard&& ) = delete;
-      void operator=( const vector_stack_guard& ) = delete;
+      vector_stack_guard& operator=( vector_stack_guard&& ) = delete;
+      vector_stack_guard& operator=( const vector_stack_guard& ) = delete;
 
       ~vector_stack_guard()
       {
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/limit_bytes.hpp b/packages/PEGTL/include/tao/pegtl/contrib/limit_bytes.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cea99fc6fef85375352e6a377c9db42d19201ff5
--- /dev/null
+++ b/packages/PEGTL/include/tao/pegtl/contrib/limit_bytes.hpp
@@ -0,0 +1,88 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+
+#ifndef TAO_PEGTL_CONTRIB_LIMIT_BYTES_HPP
+#define TAO_PEGTL_CONTRIB_LIMIT_BYTES_HPP
+
+#include <algorithm>
+
+#include "../apply_mode.hpp"
+#include "../config.hpp"
+#include "../match.hpp"
+#include "../nothing.hpp"
+#include "../rewind_mode.hpp"
+
+#if defined( __cpp_exceptions )
+#include "../parse_error.hpp"
+#else
+#include <cstdio>
+#include <exception>
+#endif
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   namespace internal
+   {
+      template< std::size_t Maximum, typename MemoryInput >
+      struct [[nodiscard]] bytes_guard
+      {
+         MemoryInput& m_in;
+         const char* m_end;
+
+         explicit bytes_guard( MemoryInput& in_in ) noexcept
+            : m_in( in_in ),
+              m_end( in_in.end() )
+         {
+            m_in.private_set_end( m_in.begin() + std::min( m_in.size(), Maximum ) );
+         }
+
+         bytes_guard( bytes_guard&& ) = delete;
+         bytes_guard( const bytes_guard& ) = delete;
+
+         ~bytes_guard()
+         {
+            m_in.private_set_end( m_end );
+         }
+
+         bytes_guard& operator=( bytes_guard&& ) = delete;
+         bytes_guard& operator=( const bytes_guard& ) = delete;
+      };
+
+      // C++17 does not allow for partial deduction guides.
+
+   }  // namespace internal
+
+   template< std::size_t Maximum >
+   struct limit_bytes
+      : maybe_nothing
+   {
+      template< typename Rule,
+                apply_mode A,
+                rewind_mode M,
+                template< typename... >
+                class Action,
+                template< typename... >
+                class Control,
+                typename ParseInput,
+                typename... States >
+      [[nodiscard]] static bool match( ParseInput& in, States&&... st )
+      {
+         internal::bytes_guard< Maximum, ParseInput > bg( in );
+         if( TAO_PEGTL_NAMESPACE::match< Rule, A, M, Action, Control >( in, st... ) ) {
+            if( in.empty() && ( bg.m_end != in.current() ) ) {
+#if defined( __cpp_exceptions )
+               throw TAO_PEGTL_NAMESPACE::parse_error( "maximum allowed rule consumption reached", in );
+#else
+               std::fputs( "maximum allowed rule consumption reached\n", stderr );
+               std::terminate();
+#endif
+            }
+            return true;
+         }
+         return false;
+      }
+   };
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#endif
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/limit_depth.hpp b/packages/PEGTL/include/tao/pegtl/contrib/limit_depth.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..447bebff7cc21a78f631e37ac23800a2741fd671
--- /dev/null
+++ b/packages/PEGTL/include/tao/pegtl/contrib/limit_depth.hpp
@@ -0,0 +1,83 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+
+#ifndef TAO_PEGTL_CONTRIB_LIMIT_DEPTH_HPP
+#define TAO_PEGTL_CONTRIB_LIMIT_DEPTH_HPP
+
+#include "../apply_mode.hpp"
+#include "../config.hpp"
+#include "../match.hpp"
+#include "../nothing.hpp"
+#include "../rewind_mode.hpp"
+
+#if defined( __cpp_exceptions )
+#include "../parse_error.hpp"
+#else
+#include <cstdio>
+#include <exception>
+#endif
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   namespace internal
+   {
+      struct [[nodiscard]] depth_guard
+      {
+         std::size_t& m_depth;
+
+         explicit depth_guard( std::size_t& depth ) noexcept
+            : m_depth( depth )
+         {
+            ++m_depth;
+         }
+
+         depth_guard( depth_guard&& ) = delete;
+         depth_guard( const depth_guard& ) = delete;
+
+         ~depth_guard()
+         {
+            --m_depth;
+         }
+
+         depth_guard& operator=( depth_guard&& ) = delete;
+         depth_guard& operator=( const depth_guard& ) = delete;
+      };
+
+   }  // namespace internal
+
+   template< std::size_t Maximum >
+   struct limit_depth
+      : maybe_nothing
+   {
+      template< typename Rule,
+                apply_mode A,
+                rewind_mode M,
+                template< typename... >
+                class Action,
+                template< typename... >
+                class Control,
+                typename ParseInput,
+                typename... States >
+      [[nodiscard]] static bool match( ParseInput& in, States&&... st )
+      {
+         if constexpr( Control< Rule >::enable ) {
+            const internal::depth_guard dg( in.private_depth );
+            if( in.private_depth > Maximum ) {
+#if defined( __cpp_exceptions )
+               throw TAO_PEGTL_NAMESPACE::parse_error( "maximum parser rule nesting depth exceeded", in );
+#else
+               std::fputs( "maximum parser rule nesting depth exceeded\n", stderr );
+               std::terminate();
+#endif
+            }
+            return TAO_PEGTL_NAMESPACE::match< Rule, A, M, Action, Control >( in, st... );
+         }
+         else {
+            return TAO_PEGTL_NAMESPACE::match< Rule, A, M, Action, Control >( in, st... );
+         }
+      }
+   };
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#endif
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/skip.hpp b/packages/PEGTL/include/tao/pegtl/contrib/skip.hpp
deleted file mode 100644
index 8aea252e83710abc24afb6319d73a67da4c9fca4..0000000000000000000000000000000000000000
--- a/packages/PEGTL/include/tao/pegtl/contrib/skip.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
-
-#ifndef TAO_PEGTL_CONTRIB_SKIP_HPP
-#define TAO_PEGTL_CONTRIB_SKIP_HPP
-
-#include "../apply_mode.hpp"
-#include "../config.hpp"
-#include "../match.hpp"
-#include "../normal.hpp"
-#include "../nothing.hpp"
-#include "../rewind_mode.hpp"
-
-namespace TAO_PEGTL_NAMESPACE
-{
-   // this is currently experimental and may change at any time
-   template< typename How, typename Where, template< typename... > class Base = normal >
-   struct skip
-   {
-      template< typename Rule >
-      struct control
-         : Base< Rule >
-      {
-         template< apply_mode A,
-                   rewind_mode M,
-                   template< typename... >
-                   class Action,
-                   template< typename... >
-                   class Control,
-                   typename ParseInput,
-                   typename... States >
-         [[nodiscard]] static bool match( ParseInput& in, States&&... st )
-         {
-            // TODO: if we only skip after but not before the actual rule,
-            // we would not need this marker.
-            auto m = in.template mark< M >();
-
-            // TODO: different conditions for before/after skipping?
-            if( Where::template value< Rule > ) {
-               // TODO: assert on result to be successful?
-               (void)TAO_PEGTL_NAMESPACE::match< How, apply_mode::nothing, M, nothing, normal >( in );
-            }
-
-            const bool result = Base< Rule >::template match< A, M, Action, Control >( in, st... );
-
-            // TODO: different conditions for before/after skipping?
-            if( result && Where::template value< Rule > ) {
-               // TODO: assert on result to be successful?
-               (void)TAO_PEGTL_NAMESPACE::match< How, apply_mode::nothing, M, nothing, normal >( in );
-            }
-
-            return m( result );
-         }
-      };
-   };
-
-}  // namespace TAO_PEGTL_NAMESPACE
-
-#endif
diff --git a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp
index ad8147a7dbf0d47ee8c3597735d78000d8c2b6b0..0c60e3a4a6a85eb28e419826072e36e6ffb62cc5 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp
@@ -38,8 +38,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
          ::close( m_fd );
       }
 
-      void operator=( const file_opener& ) = delete;
-      void operator=( file_opener&& ) = delete;
+      file_opener& operator=( const file_opener& ) = delete;
+      file_opener& operator=( file_opener&& ) = delete;
 
       [[nodiscard]] std::size_t size() const
       {
@@ -118,8 +118,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
          ::munmap( const_cast< char* >( m_data ), m_size );
       }
 
-      void operator=( const file_mapper& ) = delete;
-      void operator=( file_mapper&& ) = delete;
+      file_mapper& operator=( const file_mapper& ) = delete;
+      file_mapper& operator=( file_mapper&& ) = delete;
 
       [[nodiscard]] bool empty() const noexcept
       {
diff --git a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp
index a8a471fd9ed881c2f36ca09517d07ec971517735..73c27dec90556f0b091ed191a28ca6c4557e3d95 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp
@@ -52,8 +52,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
          ::CloseHandle( m_handle );
       }
 
-      void operator=( const file_opener& ) = delete;
-      void operator=( file_opener&& ) = delete;
+      file_opener& operator=( const file_opener& ) = delete;
+      file_opener& operator=( file_opener&& ) = delete;
 
       [[nodiscard]] std::size_t size() const
       {
@@ -134,8 +134,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
          ::CloseHandle( m_handle );
       }
 
-      void operator=( const win32_file_mapper& ) = delete;
-      void operator=( win32_file_mapper&& ) = delete;
+      win32_file_mapper& operator=( const win32_file_mapper& ) = delete;
+      win32_file_mapper& operator=( win32_file_mapper&& ) = delete;
 
       const size_t m_size;
       const HANDLE m_handle;
@@ -202,8 +202,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
          ::UnmapViewOfFile( LPCVOID( m_data ) );
       }
 
-      void operator=( const file_mapper& ) = delete;
-      void operator=( file_mapper&& ) = delete;
+      file_mapper& operator=( const file_mapper& ) = delete;
+      file_mapper& operator=( file_mapper&& ) = delete;
 
       [[nodiscard]] bool empty() const noexcept
       {
diff --git a/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp b/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp
index 7b846e1bf246cec4086d6333a9021f810928e865..7c2b054942877d96d308ea4be0a27e49e1917f00 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp
@@ -78,8 +78,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
 
       ~file_reader() = default;
 
-      void operator=( const file_reader& ) = delete;
-      void operator=( file_reader&& ) = delete;
+      file_reader& operator=( const file_reader& ) = delete;
+      file_reader& operator=( file_reader&& ) = delete;
 
       [[nodiscard]] std::size_t size() const
       {
diff --git a/packages/PEGTL/include/tao/pegtl/internal/marker.hpp b/packages/PEGTL/include/tao/pegtl/internal/marker.hpp
index 4beb9293a17f18e6d9eac6370ead5d0a53866746..a9ffa824f356e85addebb0c99ddab3373c62b3df 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/marker.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/marker.hpp
@@ -10,7 +10,7 @@
 namespace TAO_PEGTL_NAMESPACE::internal
 {
    template< typename Iterator, rewind_mode M >
-   class marker
+   class [[nodiscard]] marker
    {
    public:
       static constexpr rewind_mode next_rewind_mode = M;
@@ -23,8 +23,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
 
       ~marker() = default;
 
-      void operator=( const marker& ) = delete;
-      void operator=( marker&& ) = delete;
+      marker& operator=( const marker& ) = delete;
+      marker& operator=( marker&& ) = delete;
 
       [[nodiscard]] bool operator()( const bool result ) const noexcept
       {
@@ -33,7 +33,7 @@ namespace TAO_PEGTL_NAMESPACE::internal
    };
 
    template< typename Iterator >
-   class marker< Iterator, rewind_mode::required >
+   class [[nodiscard]] marker< Iterator, rewind_mode::required >
    {
    public:
       static constexpr rewind_mode next_rewind_mode = rewind_mode::active;
@@ -53,8 +53,8 @@ namespace TAO_PEGTL_NAMESPACE::internal
          }
       }
 
-      void operator=( const marker& ) = delete;
-      void operator=( marker&& ) = delete;
+      marker& operator=( const marker& ) = delete;
+      marker& operator=( marker&& ) = delete;
 
       [[nodiscard]] bool operator()( const bool result ) noexcept
       {
diff --git a/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp b/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp
index 1af27d7552ac0df06d72b882e47ff35a3a1e1c97..bf57db611e00058ae681096676cb5893efe7b355 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp
@@ -6,6 +6,8 @@
 
 #include "../config.hpp"
 
+#include <utility>
+
 #include "bump_help.hpp"
 #include "enable_control.hpp"
 #include "failure.hpp"
@@ -16,37 +18,12 @@
 
 namespace TAO_PEGTL_NAMESPACE::internal
 {
-   template< typename Char, Char... Cs >
-   struct ranges_impl;
-
-   template< typename Char >
-   struct ranges_impl< Char >
-   {
-      [[nodiscard]] static constexpr bool test( const Char /*unused*/ ) noexcept
-      {
-         return false;
-      }
-   };
-
-   template< typename Char, Char Eq >
-   struct ranges_impl< Char, Eq >
-   {
-      [[nodiscard]] static constexpr bool test( const Char c ) noexcept
-      {
-         return c == Eq;
-      }
-   };
-
-   template< typename Char, Char Lo, Char Hi, Char... Cs >
-   struct ranges_impl< Char, Lo, Hi, Cs... >
+   template< typename Char, Char Lo, Char Hi >
+   constexpr bool validate_range( Char c ) noexcept
    {
-      static_assert( Lo <= Hi, "invalid range detected" );
-
-      [[nodiscard]] static constexpr bool test( const Char c ) noexcept
-      {
-         return ( ( Lo <= c ) && ( c <= Hi ) ) || ranges_impl< Char, Cs... >::test( c );
-      }
-   };
+      static_assert( Lo <= Hi, "invalid range" );
+      return ( Lo <= c ) && ( c <= Hi );
+   }
 
    template< typename Peek, typename Peek::data_t... Cs >
    struct ranges
@@ -57,9 +34,21 @@ namespace TAO_PEGTL_NAMESPACE::internal
       using rule_t = ranges;
       using subs_t = empty_list;
 
+      template< std::size_t... Is >
+      [[nodiscard]] static constexpr bool test( std::index_sequence< Is... > /*unused*/, const data_t c ) noexcept
+      {
+         constexpr const data_t cs[] = { Cs... };
+         if constexpr( sizeof...( Cs ) % 2 == 0 ) {
+            return ( validate_range< data_t, cs[ 2 * Is ], cs[ 2 * Is + 1 ] >( c ) || ... );
+         }
+         else {
+            return ( validate_range< data_t, cs[ 2 * Is ], cs[ 2 * Is + 1 ] >( c ) || ... ) || ( c == cs[ sizeof...( Cs ) - 1 ] );
+         }
+      }
+
       [[nodiscard]] static constexpr bool test( const data_t c ) noexcept
       {
-         return ranges_impl< data_t, Cs... >::test( c );
+         return test( std::make_index_sequence< sizeof...( Cs ) / 2 >(), c );
       }
 
       template< int Eol >
diff --git a/packages/PEGTL/include/tao/pegtl/memory_input.hpp b/packages/PEGTL/include/tao/pegtl/memory_input.hpp
index 675e8b32fdee7203df98f20622ca50f934fc7137..7cdc9f9f88aa5a2adbaf3158730217d3a2176334 100644
--- a/packages/PEGTL/include/tao/pegtl/memory_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/memory_input.hpp
@@ -62,8 +62,8 @@ namespace TAO_PEGTL_NAMESPACE
 
          ~memory_input_base() = default;
 
-         memory_input_base operator=( const memory_input_base& ) = delete;
-         memory_input_base operator=( memory_input_base&& ) = delete;
+         memory_input_base& operator=( const memory_input_base& ) = delete;
+         memory_input_base& operator=( memory_input_base&& ) = delete;
 
          [[nodiscard]] const char* current() const noexcept
          {
@@ -124,13 +124,17 @@ namespace TAO_PEGTL_NAMESPACE
             m_current.byte = in_byte;
             m_current.line = in_line;
             m_current.column = in_column;
+            private_depth = 0;
          }
 
       protected:
          const char* const m_begin;
          iterator_t m_current;
-         const char* const m_end;
+         const char* m_end;
          const Source m_source;
+
+      public:
+         std::size_t private_depth = 0;
       };
 
       template< typename Eol, typename Source >
@@ -160,8 +164,8 @@ namespace TAO_PEGTL_NAMESPACE
 
          ~memory_input_base() = default;
 
-         memory_input_base operator=( const memory_input_base& ) = delete;
-         memory_input_base operator=( memory_input_base&& ) = delete;
+         memory_input_base& operator=( const memory_input_base& ) = delete;
+         memory_input_base& operator=( memory_input_base&& ) = delete;
 
          [[nodiscard]] const char* current() const noexcept
          {
@@ -208,13 +212,17 @@ namespace TAO_PEGTL_NAMESPACE
          void restart()
          {
             m_current = m_begin.data;
+            private_depth = 0;
          }
 
       protected:
          const internal::iterator m_begin;
          iterator_t m_current;
-         const char* const m_end;
+         const char* m_end;
          const Source m_source;
+
+      public:
+         std::size_t private_depth = 0;
       };
 
    }  // namespace internal
@@ -268,8 +276,8 @@ namespace TAO_PEGTL_NAMESPACE
 
       ~memory_input() = default;
 
-      memory_input operator=( const memory_input& ) = delete;
-      memory_input operator=( memory_input&& ) = delete;
+      memory_input& operator=( const memory_input& ) = delete;
+      memory_input& operator=( memory_input&& ) = delete;
 
       [[nodiscard]] const Source& source() const noexcept
       {
@@ -355,6 +363,11 @@ namespace TAO_PEGTL_NAMESPACE
          const char* b = begin_of_line( p );
          return std::string_view( b, static_cast< std::size_t >( end_of_line( p ) - b ) );
       }
+
+      void private_set_end( const char* new_end ) noexcept
+      {
+         this->m_end = new_end;
+      }
    };
 
    template< typename... Ts >
diff --git a/packages/PEGTL/include/tao/pegtl/mmap_input.hpp b/packages/PEGTL/include/tao/pegtl/mmap_input.hpp
index 1bd4dce191141af073cf5a0d7eba3c00298e9334..2362b6c765747a6ccbe7964417434f32fe2bf04f 100644
--- a/packages/PEGTL/include/tao/pegtl/mmap_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/mmap_input.hpp
@@ -42,8 +42,8 @@ namespace TAO_PEGTL_NAMESPACE
 
          ~mmap_holder() = default;
 
-         void operator=( const mmap_holder& ) = delete;
-         void operator=( mmap_holder&& ) = delete;
+         mmap_holder& operator=( const mmap_holder& ) = delete;
+         mmap_holder& operator=( mmap_holder&& ) = delete;
       };
 
    }  // namespace internal
@@ -67,8 +67,8 @@ namespace TAO_PEGTL_NAMESPACE
 
       ~mmap_input() = default;
 
-      void operator=( const mmap_input& ) = delete;
-      void operator=( mmap_input&& ) = delete;
+      mmap_input& operator=( const mmap_input& ) = delete;
+      mmap_input& operator=( mmap_input&& ) = delete;
    };
 
    template< typename... Ts >
diff --git a/packages/PEGTL/include/tao/pegtl/parse.hpp b/packages/PEGTL/include/tao/pegtl/parse.hpp
index 9634655c56791406a8d93101514335dc5b044312..3888fb3fc3ed619423ad4a3af5abaa2510fa9e3c 100644
--- a/packages/PEGTL/include/tao/pegtl/parse.hpp
+++ b/packages/PEGTL/include/tao/pegtl/parse.hpp
@@ -4,7 +4,7 @@
 #ifndef TAO_PEGTL_PARSE_HPP
 #define TAO_PEGTL_PARSE_HPP
 
-#include <utility>
+#include <type_traits>
 
 #include "apply_mode.hpp"
 #include "config.hpp"
@@ -16,17 +16,20 @@
 
 namespace TAO_PEGTL_NAMESPACE
 {
-   template< typename Rule,
-             template< typename... > class Action = nothing,
-             template< typename... > class Control = normal,
-             apply_mode A = apply_mode::action,
-             rewind_mode M = rewind_mode::required,
-             typename ParseInput,
-             typename... States >
-   auto parse( ParseInput&& in, States&&... st )
+   namespace internal
    {
-      return Control< Rule >::template match< A, M, Action, Control >( in, st... );
-   }
+      [[nodiscard]] inline auto get_position( const position& p ) noexcept( std::is_nothrow_copy_constructible_v< position > )
+      {
+         return p;
+      }
+
+      template< typename ParseInput >
+      [[nodiscard]] position get_position( const ParseInput& in ) noexcept( noexcept( position( in.position() ) ) )
+      {
+         return in.position();
+      }
+
+   }  // namespace internal
 
    template< typename Rule,
              template< typename... > class Action = nothing,
@@ -35,46 +38,31 @@ namespace TAO_PEGTL_NAMESPACE
              rewind_mode M = rewind_mode::required,
              typename ParseInput,
              typename... States >
-   auto parse_nested( position op, ParseInput&& in, States&&... st )
+   auto parse( ParseInput&& in, States&&... st )
    {
-#if defined( __cpp_exceptions )
-      try {
-         return parse< Rule, Action, Control, A, M >( in, st... );
-      }
-      catch( parse_error& e ) {
-         e.add_position( std::move( op ) );
-         throw;
-      }
-#else
-      (void)op;
-      return parse< Rule, Action, Control, A, M >( in, st... );
-#endif
+      return Control< Rule >::template match< A, M, Action, Control >( in, st... );
    }
 
-   // NOTE: The oi.position() in the version below can be expensive for lazy
-   // inputs, which is why the version below does not simply call the version
-   // above with said oi.position() as first parameter.
-
    template< typename Rule,
              template< typename... > class Action = nothing,
              template< typename... > class Control = normal,
              apply_mode A = apply_mode::action,
              rewind_mode M = rewind_mode::required,
-             typename OuterInput,
+             typename Outer,
              typename ParseInput,
              typename... States >
-   auto parse_nested( const OuterInput& oi, ParseInput&& in, States&&... st )
+   auto parse_nested( const Outer& o, ParseInput&& in, States&&... st )
    {
 #if defined( __cpp_exceptions )
       try {
          return parse< Rule, Action, Control, A, M >( in, st... );
       }
       catch( parse_error& e ) {
-         e.add_position( oi.position() );
+         e.add_position( internal::get_position( o ) );
          throw;
       }
 #else
-      (void)oi;
+      (void)o;
       return parse< Rule, Action, Control, A, M >( in, st... );
 #endif
    }
diff --git a/packages/PEGTL/include/tao/pegtl/parse_error.hpp b/packages/PEGTL/include/tao/pegtl/parse_error.hpp
index f471fb4205a64f3fd1e228af750e65f5c7677638..df62ce6811129c929d2bd2edec924c08fa24fc8a 100644
--- a/packages/PEGTL/include/tao/pegtl/parse_error.hpp
+++ b/packages/PEGTL/include/tao/pegtl/parse_error.hpp
@@ -107,6 +107,11 @@ namespace TAO_PEGTL_NAMESPACE
          }
          m_impl->add_position( std::move( p ) );
       }
+
+      void add_position( const position& p )
+      {
+         add_position( position( p ) );
+      }
    };
 
 }  // namespace TAO_PEGTL_NAMESPACE
diff --git a/packages/PEGTL/include/tao/pegtl/read_input.hpp b/packages/PEGTL/include/tao/pegtl/read_input.hpp
index 7b748d0930c3feabcd4ac64f60d2d2df47b78c33..019c9cd20b806f28772825a1bd9fa319b4c818a8 100644
--- a/packages/PEGTL/include/tao/pegtl/read_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/read_input.hpp
@@ -42,8 +42,8 @@ namespace TAO_PEGTL_NAMESPACE
 
       ~read_input() = default;
 
-      void operator=( const read_input& ) = delete;
-      void operator=( read_input&& ) = delete;
+      read_input& operator=( const read_input& ) = delete;
+      read_input& operator=( read_input&& ) = delete;
    };
 
    template< typename... Ts >
diff --git a/packages/PEGTL/include/tao/pegtl/string_input.hpp b/packages/PEGTL/include/tao/pegtl/string_input.hpp
index 2d83a6334e887b7b429aa420046abb3dbb488f8b..43349eb6f0e76480ff879cba04eb79eecda54152 100644
--- a/packages/PEGTL/include/tao/pegtl/string_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/string_input.hpp
@@ -30,8 +30,8 @@ namespace TAO_PEGTL_NAMESPACE
 
          ~string_holder() = default;
 
-         void operator=( const string_holder& ) = delete;
-         void operator=( string_holder&& ) = delete;
+         string_holder& operator=( const string_holder& ) = delete;
+         string_holder& operator=( string_holder&& ) = delete;
       };
 
    }  // namespace internal
@@ -52,8 +52,8 @@ namespace TAO_PEGTL_NAMESPACE
 
       ~string_input() = default;
 
-      void operator=( const string_input& ) = delete;
-      void operator=( string_input&& ) = delete;
+      string_input& operator=( const string_input& ) = delete;
+      string_input& operator=( string_input&& ) = delete;
    };
 
    template< typename... Ts >
diff --git a/packages/PEGTL/include/tao/pegtl/version.hpp b/packages/PEGTL/include/tao/pegtl/version.hpp
index 4d460c536ae80908476227a9aef03d48929d3bea..2ffc6b953b1869b690588655bf26281f84fb68d5 100644
--- a/packages/PEGTL/include/tao/pegtl/version.hpp
+++ b/packages/PEGTL/include/tao/pegtl/version.hpp
@@ -4,10 +4,10 @@
 #ifndef TAO_PEGTL_VERSION_HPP
 #define TAO_PEGTL_VERSION_HPP
 
-#define TAO_PEGTL_VERSION "3.2.0"
+#define TAO_PEGTL_VERSION "3.2.1"
 
 #define TAO_PEGTL_VERSION_MAJOR 3
 #define TAO_PEGTL_VERSION_MINOR 2
-#define TAO_PEGTL_VERSION_PATCH 0
+#define TAO_PEGTL_VERSION_PATCH 1
 
 #endif
diff --git a/packages/PEGTL/src/example/pegtl/CMakeLists.txt b/packages/PEGTL/src/example/pegtl/CMakeLists.txt
index 46ffeb0685ed9dc45b9022c90460bb68e0fcc00c..64bb7380dfb645e72c027fb5c7cc511f62ee03a3 100644
--- a/packages/PEGTL/src/example/pegtl/CMakeLists.txt
+++ b/packages/PEGTL/src/example/pegtl/CMakeLists.txt
@@ -8,6 +8,7 @@ set(example_sources
   csv1.cpp
   csv2.cpp
   dynamic_match.cpp
+  expression.cpp
   hello_world.cpp
   indent_aware.cpp
   json_analyze.cpp
@@ -29,7 +30,6 @@ set(example_sources
   peg2pegtl.cpp
   proto3.cpp
   recover.cpp
-  skipper.cpp
   s_expression.cpp
   sum.cpp
   symbol_table.cpp
diff --git a/packages/PEGTL/src/example/pegtl/expression.cpp b/packages/PEGTL/src/example/pegtl/expression.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d490f93f70fa150298f700ad48a416d97842ab6
--- /dev/null
+++ b/packages/PEGTL/src/example/pegtl/expression.cpp
@@ -0,0 +1,613 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+
+#if !defined( __cpp_exceptions )
+#include <iostream>
+int main()
+{
+   std::cerr << "Exception support required, example unavailable." << std::endl;
+   return 1;
+}
+#else
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <stdexcept>
+#include <tuple>
+#include <variant>
+#include <vector>
+
+#include <tao/pegtl.hpp>
+
+namespace TAO_PEGTL_NAMESPACE::expression
+{
+   // Expression parsing with prefix, postfix and infix operators, ternary
+   // operator and a couple of other special cases supported.
+
+   // The handling of operator precedences with left and right binding power is
+   // based on https://github.com/matklad/minipratt/blob/master/src/bin/pratt.rs
+
+   // It correctly recognises all operators with their precedence and associativity,
+   // however is still very much work-in-progress regarding a lot of details...
+
+   // TODO: Fix missing whitespace-skip before infix/postfix operators.
+   // TODO: Decide whether to use must everywhere or nowhere?
+   // TODO: Decide whether to suppress actions for sub-rules.
+   // TODO: Finalise the event-style interface or change to fake actions or actions with ops?
+   // TODO: Decide on where to use config vs. where to use grammar template parameters.
+   // TODO: Choose customisation points vs. copy-n-paste customisation.
+   // TODO: Constexpr-ify where possible with C++20.
+
+   namespace internal
+   {
+      struct prefix_info
+      {
+         prefix_info( const std::string_view n, const std::uint8_t pbp ) noexcept
+            : name( n ),
+              prefix_binding_power( pbp )
+         {
+            assert( pbp );
+         }
+
+         std::string name;
+
+         std::uint8_t prefix_binding_power;
+      };
+
+      struct infix_postfix_info
+      {
+         infix_postfix_info( const std::string_view n, const std::uint8_t lbp, const std::uint8_t rbp = 0 ) noexcept
+            : infix_postfix_info( n, std::string_view(), lbp, rbp )
+         {}
+
+         infix_postfix_info( const std::string_view n, const std::string_view o, const std::uint8_t lbp, const std::uint8_t rbp = 0 ) noexcept
+            : name( n ),
+              other( o ),
+              left_binding_power( lbp ),
+              right_binding_power( rbp )
+         {
+            if( right_binding_power > 0 ) {
+               assert( std::min( left_binding_power, right_binding_power ) & 1 );
+               assert( 2 * std::min( left_binding_power, right_binding_power ) + 1 == left_binding_power + right_binding_power );
+            }
+            assert( left_binding_power > 0 );
+         }
+
+         [[nodiscard]] bool is_infix() const noexcept
+         {
+            return right_binding_power != 0;
+         }
+
+         [[nodiscard]] bool is_postfix() const noexcept
+         {
+            return right_binding_power == 0;
+         }
+
+         std::string name;
+         std::string other;  // Used for the ':' of the ternary operator etc.
+
+         std::uint8_t left_binding_power;
+         std::uint8_t right_binding_power;
+      };
+
+      template< typename ParseInput >
+      [[nodiscard]] bool match_string_view( ParseInput& in, const std::string_view sv )
+      {
+         if( in.size( sv.size() ) >= sv.size() ) {
+            if( std::memcmp( in.current(), sv.data(), sv.size() ) == 0 ) {
+               in.bump( sv.size() );
+               return true;
+            }
+         }
+         return false;
+      }
+
+      template< typename ParseInput, typename OperatorInfo >
+      [[nodiscard]] const OperatorInfo* match_prefix( ParseInput& in, const std::size_t max_length, const std::vector< OperatorInfo >& ops )
+      {
+         const std::size_t max = std::min( max_length, in.size( max_length ) );
+         for( std::string op( in.current(), max ); !op.empty(); op.pop_back() ) {
+            if( const auto i = std::find_if( ops.begin(), ops.end(), [ = ]( const OperatorInfo& info ) { return info.name == op; } ); i != ops.end() ) {
+               in.bump( op.size() );
+               return &*i;
+            }
+         }
+         return nullptr;
+      }
+
+      template< typename ParseInput, typename OperatorInfo >
+      [[nodiscard]] const OperatorInfo* match_infix_postfix( ParseInput& in, const std::size_t max_length, const std::vector< OperatorInfo >& ops, const std::uint8_t min_precedence )
+      {
+         const std::size_t max = std::min( max_length, in.size( max_length ) );
+         for( std::string op( in.current(), max ); !op.empty(); op.pop_back() ) {
+            if( const auto i = std::find_if( ops.begin(), ops.end(), [ = ]( const OperatorInfo& info ) { return info.name == op; } ); ( i != ops.end() ) && ( i->left_binding_power >= min_precedence ) ) {
+               in.bump( op.size() );
+               return &*i;
+            }
+         }
+         return nullptr;
+      }
+
+      template< typename T >
+      [[nodiscard]] std::vector< T > sorted_operator_vector( const std::initializer_list< T >& t )
+      {
+         std::vector< T > v{ t };
+         const auto less = []( const auto& l, const auto& r ) { return l.name < r.name; };
+         std::sort( v.begin(), v.end(), less );
+         return v;
+      }
+
+      struct operator_maps
+      {
+         // clang-format off
+         operator_maps()
+            : prefix( sorted_operator_vector( {
+                  prefix_info( "!", 80 ),
+                  prefix_info( "+", 80 ),
+                  prefix_info( "-", 80 ),
+                  prefix_info( "~", 80 ),
+                  prefix_info( "*", 80 ),
+                  prefix_info( "&", 80 ),
+                  prefix_info( "++", 80 ),
+                  prefix_info( "--", 80 )
+               } ) ),
+              infix_postfix( sorted_operator_vector( {
+                  infix_postfix_info( "::", 99, 100 ),  // Special: Followed by identifier (or template-space-identifer, which we don't support yet).
+                  infix_postfix_info( ".*", 37, 38 ),
+                  infix_postfix_info( "->*", 37, 38 ),
+                  infix_postfix_info( "*", 35, 36 ),
+                  infix_postfix_info( "/", 35, 36 ),
+                  infix_postfix_info( "%", 35, 36 ),
+                  infix_postfix_info( "+", 33, 34 ),
+                  infix_postfix_info( "-", 33, 34 ),
+                  infix_postfix_info( "<<", 31, 32 ),
+                  infix_postfix_info( ">>", 31, 32 ),
+                  infix_postfix_info( "<=>", 29, 30 ),
+                  infix_postfix_info( "<", 27, 28 ),
+                  infix_postfix_info( "<=", 27, 28 ),
+                  infix_postfix_info( ">", 27, 28 ),
+                  infix_postfix_info( ">=", 27, 28 ),
+                  infix_postfix_info( "==", 25, 26 ),
+                  infix_postfix_info( "!=", 25, 26 ),
+                  infix_postfix_info( "&", 23, 24 ),
+                  infix_postfix_info( "^", 21, 22 ),
+                  infix_postfix_info( "|", 19, 20 ),
+                  infix_postfix_info( "&&", 17, 18 ),
+                  infix_postfix_info( "||", 15, 16 ),
+                  infix_postfix_info( "?", ":", 14, 13 ),  // Special: Ternary operator.
+                  infix_postfix_info( "=", 12, 11 ),
+                  infix_postfix_info( "+=", 12, 11 ),
+                  infix_postfix_info( "-=", 12, 11 ),
+                  infix_postfix_info( "*=", 12, 11 ),
+                  infix_postfix_info( "/=", 12, 11 ),
+                  infix_postfix_info( "%=", 12, 11 ),
+                  infix_postfix_info( "<<=", 12, 11 ),
+                  infix_postfix_info( ">>=", 12, 11 ),
+                  infix_postfix_info( "&=", 12, 11 ),
+                  infix_postfix_info( "^=", 12, 11 ),
+                  infix_postfix_info( "|=", 12, 11 ),
+                  // infix_postfix_info( ",", 9, 10 ),  // TODO: Enable, but forbid in function argument list.
+                  infix_postfix_info( "[", "]", 90 ),  // Special: Argument list.
+                  infix_postfix_info( "(", ")", 90 ),  // Special: Argument list.
+                  infix_postfix_info( ".", 90 ),  // Special: Followed by identifier.
+                  infix_postfix_info( "->", 90 ),  // Special: Followed by identifier.
+                  infix_postfix_info( "++", 90 ),
+                  infix_postfix_info( "--", 90 )
+               } ) ),
+              max_prefix_length( std::max_element( prefix.begin(), prefix.end(), []( const auto& l, const auto& r ) { return l.name.size() < r.name.size(); } )->name.size() ),
+              max_infix_postfix_length( std::max_element( infix_postfix.begin(), infix_postfix.end(), []( const auto& l, const auto& r ) { return l.name.size() < r.name.size(); } )->name.size() )
+         {
+            // These are C++20 operators with the correct associativity and relative precedence, however some are still missing:
+            // TODO: Compound literal (C99), _Alignof (C11), Functional cast, sizeof, co_await, co_yield, throw, new, new[], delete, delete[], C-style casts.
+         }
+         // clang-format on
+
+         const std::vector< prefix_info > prefix;
+         const std::vector< infix_postfix_info > infix_postfix;
+
+         const std::size_t max_prefix_length;
+         const std::size_t max_infix_postfix_length;
+      };
+
+      struct string_view_rule
+      {
+         template< apply_mode A,
+                   rewind_mode M,
+                   template< typename... >
+                   class Action,
+                   template< typename... >
+                   class Control,
+                   typename ParseInput >
+         [[nodiscard]] static bool match( ParseInput& in, const std::string_view sv ) noexcept( noexcept( match_string_view( in, sv ) ) )
+         {
+            return match_string_view( in, sv );
+         }
+      };
+
+      struct comment
+         : seq< one< '#' >, until< eolf > >
+      {};
+
+      struct ignored
+         : sor< space, comment >
+      {};
+
+      template< typename Literal, typename Identifier >
+      struct expression;
+
+      template< typename Literal, typename Identifier >
+      struct bracket_expression
+      {
+         template< apply_mode A,
+                   rewind_mode M,
+                   template< typename... >
+                   class Action,
+                   template< typename... >
+                   class Control,
+                   typename ParseInput,
+                   typename Result,
+                   typename Config >
+         [[nodiscard]] static bool match( ParseInput& in, Result& res, const Config& cfg, const std::uint8_t /*unused*/ )
+         {
+            return Control< if_must< one< '(' >, star< ignored >, expression< Literal, Identifier >, star< ignored >, one< ')' > > >::template match< A, M, Action, Control >( in, res, cfg, 0 );
+         }
+      };
+
+      template< typename Literal, typename Identifier >
+      struct prefix_expression
+      {
+         template< apply_mode A,
+                   rewind_mode M,
+                   template< typename... >
+                   class Action,
+                   template< typename... >
+                   class Control,
+                   typename ParseInput,
+                   typename Result,
+                   typename Config >
+         [[nodiscard]] static bool match( ParseInput& in, Result& res, const Config& cfg, const std::uint8_t /*unused*/ )
+         {
+            if( const auto* info = match_prefix( in, cfg.max_prefix_length, cfg.prefix ) ) {
+               (void)Control< must< star< ignored >, expression< Literal, Identifier > > >::template match< A, M, Action, Control >( in, res, cfg, info->prefix_binding_power );
+               if constexpr( A == apply_mode::action ) {
+                  res.prefix( info->name );
+               }
+               return true;
+            }
+            return false;
+         }
+      };
+
+      template< typename Literal, typename Identifier >
+      struct infix_postfix_expression
+      {
+         template< apply_mode A,
+                   rewind_mode M,
+                   template< typename... >
+                   class Action,
+                   template< typename... >
+                   class Control,
+                   typename ParseInput,
+                   typename Result,
+                   typename Config >
+         [[nodiscard]] static bool match( ParseInput& in, Result& res, const Config& cfg, const std::uint8_t min )
+         {
+            if( const auto* info = match_infix_postfix( in, cfg.max_infix_postfix_length, cfg.infix_postfix, min ) ) {
+               if( info->name == "?" ) {
+                  (void)Control< must< star< ignored >, expression< Literal, Identifier > > >::template match< A, M, Action, Control >( in, res, cfg, 0 );
+                  (void)Control< must< star< ignored >, string_view_rule > >::template match< A, M, Action, Control >( in, info->other );
+                  (void)Control< must< star< ignored >, expression< Literal, Identifier > > >::template match< A, M, Action, Control >( in, res, cfg, info->right_binding_power );
+                  if constexpr( A == apply_mode::action ) {
+                     res.ternary( info->name, info->other );
+                  }
+                  return true;
+               }
+               if( ( info->name == "." ) || ( info->name == "::" ) || ( info->name == "->" ) ) {
+                  (void)Control< must< star< ignored >, Identifier > >::template match< A, M, Action, Control >( in, res, cfg, 0 );
+                  if constexpr( A == apply_mode::action ) {
+                     res.infix( info->name );
+                  }
+                  return true;
+               }
+               if( ( info->name == "(" ) || ( info->name == "[" ) ) {
+                  const std::size_t size = res.term_stack.size();  // TODO: Determine number of arguments without relying on res!?
+                  (void)Control< must< star< ignored >, opt< list_must< expression< Literal, Identifier >, one< ',' >, ignored > > > >::template match< A, M, Action, Control >( in, res, cfg, 0 );
+                  (void)Control< must< star< ignored >, string_view_rule > >::template match< A, M, Action, Control >( in, info->other );
+                  if constexpr( A == apply_mode::action ) {
+                     res.call( info->name, info->other, res.term_stack.size() - size );
+                  }
+                  return true;
+               }
+               if( info->is_infix() ) {
+                  (void)Control< must< star< ignored >, expression< Literal, Identifier > > >::template match< A, M, Action, Control >( in, res, cfg, info->right_binding_power );
+                  if constexpr( A == apply_mode::action ) {
+                     res.infix( info->name );
+                  }
+                  return true;
+               }
+               if( info->is_postfix() ) {
+                  if constexpr( A == apply_mode::action ) {
+                     res.postfix( info->name );
+                  }
+                  return true;
+               }
+            }
+            return false;
+         }
+      };
+
+      template< typename Literal, typename Identifier >
+      struct first_expression
+         : sor< Literal, Identifier, bracket_expression< Literal, Identifier >, prefix_expression< Literal, Identifier > >
+      {};
+
+      template< typename Literal, typename Identifier >
+      struct expression
+         : seq< first_expression< Literal, Identifier >, star< infix_postfix_expression< Literal, Identifier > > >
+      {};
+
+   }  // namespace internal
+
+   template< typename Literal, typename Identifier >
+   struct grammar
+   {
+      using rule_t = grammar;
+      using subs_t = type_list< internal::expression< Literal, Identifier > >;
+
+      template< apply_mode A,
+                rewind_mode M,
+                template< typename... >
+                class Action,
+                template< typename... >
+                class Control,
+                typename ParseInput,
+                typename Result >
+      [[nodiscard]] static bool match( ParseInput& in, Result& res )
+      {
+         const internal::operator_maps cfg;
+         return match< A, M, Action, Control >( in, res, cfg );
+      }
+
+      template< apply_mode A,
+                rewind_mode M,
+                template< typename... >
+                class Action,
+                template< typename... >
+                class Control,
+                typename ParseInput,
+                typename Result,
+                typename Config >
+      [[nodiscard]] static bool match( ParseInput& in, Result& res, const Config& cfg )
+      {
+         return Control< internal::expression< Literal, Identifier > >::template match< A, M, Action, Control >( in, res, cfg, 0 );
+      }
+   };
+
+}  // namespace TAO_PEGTL_NAMESPACE::expression
+
+namespace application
+{
+   namespace pegtl = TAO_PEGTL_NAMESPACE;
+
+   struct term_t;
+
+   using tuple_t = std::tuple< std::string, std::vector< term_t > >;
+   using variant_t = std::variant< std::int64_t, std::string, tuple_t >;
+
+   struct term_t
+   {
+      explicit term_t( const std::int64_t l ) noexcept
+         : variant( l )
+      {}
+
+      explicit term_t( std::string&& s ) noexcept
+         : variant( std::move( s ) )
+      {}
+
+      explicit term_t( variant_t&& v ) noexcept
+         : variant( std::move( v ) )
+      {}
+
+      variant_t variant;
+   };
+
+   [[nodiscard]] inline std::string operator+( const char* l, const std::string_view r )
+   {
+      return std::string( l ) + " '" + std::string( r ) + "'";
+   }
+
+   struct result
+   {
+      void infix( const std::string_view op )
+      {
+         assert( term_stack.size() >= 2 );
+         {
+            variant_t tmp = tuple_t( "infix" + op, { std::move( term_stack.at( term_stack.size() - 2 ) ), std::move( term_stack.at( term_stack.size() - 1 ) ) } );
+            term_stack.pop_back();
+            term_stack.back().variant = std::move( tmp );
+         }
+         assert( string_stack.size() >= 2 );
+         {
+            std::string tmp = "( " + string_stack.at( string_stack.size() - 2 ) + " " + std::string( op ) + " " + string_stack.at( string_stack.size() - 1 ) + " )";
+            string_stack.pop_back();
+            string_stack.back() = std::move( tmp );
+         }
+      }
+
+      void prefix( const std::string_view op )
+      {
+         assert( term_stack.size() >= 1 );  // NOLINT(readability-container-size-empty)
+         {
+            variant_t tmp = tuple_t( "prefix" + op, { std::move( term_stack.at( term_stack.size() - 1 ) ) } );
+            term_stack.back().variant = std::move( tmp );
+         }
+         assert( string_stack.size() >= 1 );  // NOLINT(readability-container-size-empty)
+         {
+            std::string tmp = std::string( op ) + "( " + string_stack.at( string_stack.size() - 1 ) + " )";
+            string_stack.back() = std::move( tmp );
+         }
+      }
+
+      void postfix( const std::string_view op )
+      {
+         assert( term_stack.size() >= 1 );  // NOLINT(readability-container-size-empty)
+         {
+            variant_t tmp = tuple_t( "postfix" + op, { std::move( term_stack.at( term_stack.size() - 1 ) ) } );
+            term_stack.back().variant = std::move( tmp );
+         }
+         assert( string_stack.size() >= 1 );  // NOLINT(readability-container-size-empty)
+         {
+            std::string tmp = "( " + string_stack.at( string_stack.size() - 1 ) + " )" + std::string( op );
+            string_stack.back() = std::move( tmp );
+         }
+      }
+
+      void ternary( const std::string_view op, const std::string_view o2 )
+      {
+         assert( term_stack.size() >= 2 );
+         {
+            variant_t tmp = tuple_t( "ternary", { std::move( term_stack.at( term_stack.size() - 3 ) ), std::move( term_stack.at( term_stack.size() - 2 ) ), std::move( term_stack.at( term_stack.size() - 1 ) ) } );
+            term_stack.pop_back();
+            term_stack.pop_back();
+            term_stack.back().variant = std::move( tmp );
+         }
+         assert( string_stack.size() >= 2 );
+         {
+            std::string tmp = "( " + string_stack.at( string_stack.size() - 3 ) + " " + std::string( op ) + " " + string_stack.at( string_stack.size() - 2 ) + " " + std::string( o2 ) + " " + string_stack.at( string_stack.size() - 1 ) + " )";
+            string_stack.pop_back();
+            string_stack.pop_back();
+            string_stack.back() = std::move( tmp );
+         }
+      }
+
+      void call( const std::string_view op, const std::string_view o2, const std::size_t args )
+      {
+         assert( term_stack.size() > args );
+         {
+            variant_t tmp = tuple_t( "call '" + std::string( op ) + std::string( o2 ) + "'", std::vector< term_t >( term_stack.end() - args - 1, term_stack.end() ) );
+            for( std::size_t i = 0; i < args; ++i ) {
+               term_stack.pop_back();
+            }
+            term_stack.back().variant = ( std::move( tmp ) );
+         }
+         assert( string_stack.size() > args );
+         {
+            std::string tmp = *( string_stack.end() - args - 1 ) + std::string( op ) + " ";
+            for( std::size_t i = 0; i < args; ++i ) {
+               if( i > 0 ) {
+                  tmp += ", ";
+               }
+               tmp += *( string_stack.end() - args + i );
+            }
+            tmp += " " + std::string( o2 );
+            string_stack.resize( string_stack.size() - args );
+            string_stack.back() = std::move( tmp );
+         }
+      }
+
+      void number( const std::int64_t l )
+      {
+         term_stack.emplace_back( l );
+         string_stack.emplace_back( std::to_string( l ) );
+      }
+
+      void identifier( const std::string& id )
+      {
+         term_stack.emplace_back( id );
+         string_stack.emplace_back( id );
+      }
+
+      std::vector< term_t > term_stack;
+      std::vector< std::string > string_stack;
+   };
+
+   inline std::ostream& operator<<( std::ostream& o, const term_t& t );
+
+   inline std::ostream& operator<<( std::ostream& o, const tuple_t& t )
+   {
+      o << "{ " << std::get< 0 >( t );
+      for( const auto& res : std::get< 1 >( t ) ) {
+         o << " " << res;
+      }
+      o << " }";
+      return o;
+   }
+
+   inline std::ostream& operator<<( std::ostream& o, const variant_t& v )
+   {
+      std::visit( [ & ]( const auto& t ) { o << t; }, v );
+      return o;
+   }
+
+   inline std::ostream& operator<<( std::ostream& o, const term_t& t )
+   {
+      o << t.variant;
+      return o;
+   }
+
+   struct literal
+      : pegtl::plus< pegtl::digit >
+   {};
+
+   struct grammar
+      : pegtl::must< pegtl::expression::grammar< literal, pegtl::identifier >, pegtl::eof >
+   {};
+
+   template< typename Rule >
+   struct action
+      : pegtl::nothing< Rule >
+   {};
+
+   template<>
+   struct action< literal >
+   {
+      template< typename Input, typename... States >
+      static void apply( const Input& in, result& res, States&&... /*unused*/ )
+      {
+         res.number( std::stoll( in.string() ) );
+      }
+   };
+
+   template<>
+   struct action< pegtl::identifier >
+   {
+      template< typename Input, typename... States >
+      static void apply( const Input& in, result& res, States&&... /*unused*/ )
+      {
+         res.identifier( in.string() );
+      }
+   };
+
+}  // namespace application
+
+int main( int argc, char** argv )
+{
+   // if( TAO_PEGTL_NAMESPACE::analyze< application::grammar >() != 0 ) {
+   //    return 1;
+   // }
+   for( int i = 1; i < argc; ++i ) {
+      TAO_PEGTL_NAMESPACE::argv_input in( argv, i );
+      try {
+         application::result res;
+         TAO_PEGTL_NAMESPACE::parse< application::grammar, application::action >( in, res );
+         std::cout << "Input: " << argv[ i ] << std::endl;
+         assert( res.term_stack.size() == 1 );
+         assert( res.string_stack.size() == 1 );
+         std::cout << "Result: " << res.string_stack.at( 0 ) << std::endl;
+         std::cout << "Result: " << res.term_stack.at( 0 ) << std::endl;
+      }
+      catch( const TAO_PEGTL_NAMESPACE::parse_error& e ) {
+         const auto p = e.positions().front();
+         std::cerr << e.what() << '\n'
+                   << in.line_at( p ) << '\n'
+                   << std::setw( p.column ) << '^' << '\n';
+      }
+   }
+   return 0;
+}
+
+#endif
diff --git a/packages/PEGTL/src/example/pegtl/json_classes.hpp b/packages/PEGTL/src/example/pegtl/json_classes.hpp
index c9f26de16fc3945cea12f3dcd780e89e47f5c72a..3e69515b0e35066c3c6b6f141a1a0c91e85a7f11 100644
--- a/packages/PEGTL/src/example/pegtl/json_classes.hpp
+++ b/packages/PEGTL/src/example/pegtl/json_classes.hpp
@@ -35,8 +35,8 @@ namespace example
       json_base( const json_base& ) = delete;
       json_base( json_base&& ) = delete;
 
-      void operator=( const json_base& ) = delete;
-      void operator=( json_base&& ) = delete;
+      json_base& operator=( const json_base& ) = delete;
+      json_base& operator=( json_base&& ) = delete;
 
       virtual void stream( std::ostream& ) const = 0;
 
diff --git a/packages/PEGTL/src/example/pegtl/json_parse.cpp b/packages/PEGTL/src/example/pegtl/json_parse.cpp
index f3c572d12e984765fac7715e966c20a13dfd4248..240a8898c38744eff056e5cd885a4464e93450de 100644
--- a/packages/PEGTL/src/example/pegtl/json_parse.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_parse.cpp
@@ -6,6 +6,7 @@
 
 #include <tao/pegtl.hpp>
 #include <tao/pegtl/contrib/json.hpp>
+#include <tao/pegtl/contrib/limit_depth.hpp>
 #include <tao/pegtl/contrib/trace.hpp>
 
 #include "json_errors.hpp"
@@ -16,6 +17,15 @@ namespace example
 {
    using grammar = pegtl::seq< pegtl::json::text, pegtl::eof >;
 
+   template< typename >
+   struct action
+   {};
+
+   template<>
+   struct action< pegtl::json::value >
+      : pegtl::limit_depth< 42 >
+   {};
+
 }  // namespace example
 
 int main( int argc, char** argv )  // NOLINT(bugprone-exception-escape)
@@ -30,7 +40,7 @@ int main( int argc, char** argv )  // NOLINT(bugprone-exception-escape)
    pegtl::argv_input in( argv, 1 );
 #if defined( __cpp_exceptions )
    try {
-      pegtl::parse< example::grammar, pegtl::nothing, example::control >( in );
+      pegtl::parse< example::grammar, example::action, example::control >( in );
    }
    catch( const pegtl::parse_error& e ) {
       const auto p = e.positions().front();
@@ -40,7 +50,7 @@ int main( int argc, char** argv )  // NOLINT(bugprone-exception-escape)
       return 1;
    }
 #else
-   if( !pegtl::parse< example::grammar, pegtl::nothing, example::control >( in ) ) {
+   if( !pegtl::parse< example::grammar, example::action, example::control >( in ) ) {
       std::cerr << "error occurred" << std::endl;
       return 1;
    }
diff --git a/packages/PEGTL/src/example/pegtl/skipper.cpp b/packages/PEGTL/src/example/pegtl/skipper.cpp
deleted file mode 100644
index fd11200bbd16442ff4d90ddd815bc8664b625fb4..0000000000000000000000000000000000000000
--- a/packages/PEGTL/src/example/pegtl/skipper.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
-
-#include <iostream>
-#include <string>
-
-#include <tao/pegtl.hpp>
-#include <tao/pegtl/contrib/skip.hpp>
-
-namespace pegtl = TAO_PEGTL_NAMESPACE;
-
-namespace demo
-{
-   // define your grammar without concerning yourself with skipping whitespace
-
-   // clang-format off
-   struct key : pegtl::identifier {};
-   struct value : pegtl::identifier {};
-   struct assign : pegtl::one< '=' > {};
-   struct grammar : pegtl::seq< key, assign, value, pegtl::eof > {};
-   // clang-format on
-
-   // define your actions as usual
-
-   template< typename Rule >
-   struct action
-   {};
-
-   template<>
-   struct action< key >
-   {
-      template< typename ActionInput >
-      static void apply( const ActionInput& in )
-      {
-         std::cout << "key: '" << in.string() << "'\n";
-      }
-   };
-
-   template<>
-   struct action< value >
-   {
-      template< typename ActionInput >
-      static void apply( const ActionInput& in )
-      {
-         std::cout << "value: '" << in.string() << "'\n";
-      }
-   };
-
-   // now specify how and where to skip whitespace
-
-   using skip_how = pegtl::star< pegtl::sor< pegtl::space, pegtl::eol > >;
-
-   // clang-format off
-   template< typename > inline constexpr bool where = false;
-   template<> inline constexpr bool where< key > = true;
-   template<> inline constexpr bool where< value > = true;
-   // clang-format on
-
-   // as 'skip<>' can not take 'where' as a template parameter directly, we need to wrap it.
-   struct skip_where
-   {
-      // when to skip
-      template< typename Rule >
-      static constexpr auto value = where< Rule >;
-   };
-
-   template< typename Rule >
-   using control = pegtl::skip< skip_how, skip_where >::control< Rule >;
-
-   // The above is a first step, the helper (skip<>) may change in the future
-   // to allow separate skipping before and after and also support different skip
-   // rules for each rule/set-of-rules. Also, a more convenient way to specify
-   // the whole thing.
-
-}  // namespace demo
-
-int main( int argc, char** argv )  // NOLINT(bugprone-exception-escape)
-{
-   if( argc > 1 ) {
-      pegtl::argv_input in( argv, 1 );
-      if( pegtl::parse< demo::grammar, demo::action, demo::control >( in ) ) {
-         std::cout << "success!" << std::endl;
-      }
-      else {
-         std::cerr << "failure." << std::endl;
-      }
-   }
-}
diff --git a/packages/PEGTL/src/example/pegtl/token_input.cpp b/packages/PEGTL/src/example/pegtl/token_input.cpp
index 6f03dad7ec91668e27c9fe9b3157a9d21f0cdf95..7d2795a3fb9bf453bbb3aa222ab2185d2a2b9517 100644
--- a/packages/PEGTL/src/example/pegtl/token_input.cpp
+++ b/packages/PEGTL/src/example/pegtl/token_input.cpp
@@ -35,8 +35,8 @@ namespace TAO_PEGTL_NAMESPACE
 
       ~token_action_input() = default;
 
-      token_action_input operator=( const token_action_input& ) = delete;
-      token_action_input operator=( token_action_input&& ) = delete;
+      token_action_input& operator=( const token_action_input& ) = delete;
+      token_action_input& operator=( token_action_input&& ) = delete;
 
       [[nodiscard]] const iterator_t& iterator() const noexcept
       {
@@ -101,8 +101,8 @@ namespace TAO_PEGTL_NAMESPACE
 
       ~token_parse_input() = default;
 
-      token_parse_input operator=( const token_parse_input& ) = delete;
-      token_parse_input operator=( token_parse_input&& ) = delete;
+      token_parse_input& operator=( const token_parse_input& ) = delete;
+      token_parse_input& operator=( token_parse_input&& ) = delete;
 
       void discard() const noexcept {}
 
diff --git a/packages/PEGTL/src/test/pegtl/CMakeLists.txt b/packages/PEGTL/src/test/pegtl/CMakeLists.txt
index 889b0520af98201aeeead2caaeef175fa0503578..6bfacb4a72caec4e1dd08bdaade1d934e3bac9e0 100644
--- a/packages/PEGTL/src/test/pegtl/CMakeLists.txt
+++ b/packages/PEGTL/src/test/pegtl/CMakeLists.txt
@@ -23,6 +23,7 @@ set(test_sources
   change_action_and_states.cpp
   change_state.cpp
   change_states.cpp
+  check_bytes.cpp
   contains.cpp
   contrib_alphabet.cpp
   contrib_analyze.cpp
@@ -65,6 +66,8 @@ set(test_sources
   internal_endian.cpp
   internal_file_mapper.cpp
   internal_file_opener.cpp
+  limit_bytes.cpp
+  limit_depth.cpp
   parse_error.cpp
   pegtl_string_t.cpp
   position.cpp
diff --git a/packages/PEGTL/src/test/pegtl/check_bytes.cpp b/packages/PEGTL/src/test/pegtl/check_bytes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b773abed9b0d168bc4502b779617a38a5a3051b0
--- /dev/null
+++ b/packages/PEGTL/src/test/pegtl/check_bytes.cpp
@@ -0,0 +1,50 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+
+#include <tao/pegtl/contrib/check_bytes.hpp>
+
+#include "test.hpp"
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   struct test_rule
+      : star< alpha >
+   {};
+
+   struct test_grammar
+      : seq< test_rule, eof >
+   {};
+
+   template< typename Rule >
+   struct test_action
+      : nothing< Rule >
+   {};
+
+   template<>
+   struct test_action< test_rule >
+      : check_bytes< 5 >
+   {};
+
+   void unit_test()
+   {
+      memory_input<> i1( "aaa", __FUNCTION__ );
+      const auto r1 = pegtl::parse< test_grammar >( i1 );
+      TAO_PEGTL_TEST_ASSERT( r1 );
+
+      memory_input<> i2( "aaaaaaaaaaa", __FUNCTION__ );
+      const auto r2 = pegtl::parse< test_grammar >( i2 );
+      TAO_PEGTL_TEST_ASSERT( r2 );
+
+      memory_input<> i3( "aaa", __FUNCTION__ );
+      const auto r3 = pegtl::parse< test_grammar, test_action >( i3 );
+      TAO_PEGTL_TEST_ASSERT( r3 );
+
+#if defined( __cpp_exceptions )
+      memory_input<> i4( "aaaaaaaaaaa", __FUNCTION__ );
+      TAO_PEGTL_TEST_THROWS( pegtl::parse< test_grammar, test_action >( i4 ) );
+#endif
+   }
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#include "main.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp b/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp
index 5c6c268cb6ee4b8bbc27cde7eb2ce779459875c8..fc85dec752d01633ddf3bf9f45642d84b4c896ea 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp
@@ -32,8 +32,8 @@ namespace TAO_PEGTL_NAMESPACE
          dtor = true;
       }
 
-      void operator=( test_class&& ) = delete;
-      void operator=( const test_class& ) = delete;
+      test_class& operator=( test_class&& ) = delete;
+      test_class& operator=( const test_class& ) = delete;
    };
 
    using test_grammar = sor< alpha, digit >;
diff --git a/packages/PEGTL/src/test/pegtl/limit_bytes.cpp b/packages/PEGTL/src/test/pegtl/limit_bytes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..94e66c3f69bebc1748b9d86a322ee5518012492c
--- /dev/null
+++ b/packages/PEGTL/src/test/pegtl/limit_bytes.cpp
@@ -0,0 +1,50 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+
+#include <tao/pegtl/contrib/limit_bytes.hpp>
+
+#include "test.hpp"
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   struct test_rule
+      : star< alpha >
+   {};
+
+   struct test_grammar
+      : seq< test_rule, eof >
+   {};
+
+   template< typename Rule >
+   struct test_action
+      : nothing< Rule >
+   {};
+
+   template<>
+   struct test_action< test_rule >
+      : limit_bytes< 5 >
+   {};
+
+   void unit_test()
+   {
+      memory_input<> i1( "aaa", __FUNCTION__ );
+      const auto r1 = pegtl::parse< test_grammar >( i1 );
+      TAO_PEGTL_TEST_ASSERT( r1 );
+
+      memory_input<> i2( "aaaaaaaaaaa", __FUNCTION__ );
+      const auto r2 = pegtl::parse< test_grammar >( i2 );
+      TAO_PEGTL_TEST_ASSERT( r2 );
+
+      memory_input<> i3( "aaa", __FUNCTION__ );
+      const auto r3 = pegtl::parse< test_grammar, test_action >( i3 );
+      TAO_PEGTL_TEST_ASSERT( r3 );
+
+#if defined( __cpp_exceptions )
+      memory_input<> i4( "aaaaaaaaaaa", __FUNCTION__ );
+      TAO_PEGTL_TEST_THROWS( pegtl::parse< test_grammar, test_action >( i4 ) );
+#endif
+   }
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#include "main.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/limit_depth.cpp b/packages/PEGTL/src/test/pegtl/limit_depth.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fb1b2713c3091278d716a4857a9f34e3e1f2fe4
--- /dev/null
+++ b/packages/PEGTL/src/test/pegtl/limit_depth.cpp
@@ -0,0 +1,50 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+
+#include <tao/pegtl/contrib/limit_depth.hpp>
+
+#include "test.hpp"
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   struct test_recursive
+      : seq< alpha, opt< test_recursive > >
+   {};
+
+   struct test_grammar
+      : seq< test_recursive, eof >
+   {};
+
+   template< typename Rule >
+   struct test_action
+      : nothing< Rule >
+   {};
+
+   template<>
+   struct test_action< test_recursive >
+      : limit_depth< 5 >
+   {};
+
+   void unit_test()
+   {
+      memory_input<> i1( "aaa", __FUNCTION__ );
+      const auto r1 = pegtl::parse< test_grammar >( i1 );
+      TAO_PEGTL_TEST_ASSERT( r1 );
+
+      memory_input<> i2( "aaaaaaaaaaa", __FUNCTION__ );
+      const auto r2 = pegtl::parse< test_grammar >( i2 );
+      TAO_PEGTL_TEST_ASSERT( r2 );
+
+      memory_input<> i3( "aaa", __FUNCTION__ );
+      const auto r3 = pegtl::parse< test_grammar, test_action >( i3 );
+      TAO_PEGTL_TEST_ASSERT( r3 );
+
+#if defined( __cpp_exceptions )
+      memory_input<> i4( "aaaaaaaaaaa", __FUNCTION__ );
+      TAO_PEGTL_TEST_THROWS( pegtl::parse< test_grammar, test_action >( i4 ) );
+#endif
+   }
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#include "main.hpp"
diff --git a/packages/kokkos/.github/workflows/cancelling.yml b/packages/kokkos/.github/workflows/cancelling.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fa30adf956e1c272c1b8d29d131f225b1ff94919
--- /dev/null
+++ b/packages/kokkos/.github/workflows/cancelling.yml
@@ -0,0 +1,20 @@
+name: cancel-builds-on-update
+on:
+  workflow_run:
+    workflows: ['github-Linux', 'github-OSX']
+    types: ['requested']
+
+jobs:
+  cancel-duplicate-workflow-runs:
+    name: "Cancel duplicate workflow runs"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: potiuk/cancel-workflow-runs@master
+        name: "Cancel duplicate workflow runs"
+        with:
+          cancelMode: duplicates
+          cancelFutureDuplicates: true
+          token: ${{ secrets.GITHUB_TOKEN }}
+          sourceRunId: ${{ github.event.workflow_run.id }}
+          notifyPRCancel: true
+          skipEventTypes: '["push", "schedule"]'
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0e5f523ccf77014b18a034659b450f7036901747
--- /dev/null
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
@@ -0,0 +1,72 @@
+name: github-Linux
+on: [push, pull_request]
+
+jobs:
+  CI:
+    continue-on-error: true
+    strategy:
+      matrix:
+        distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest']
+        cxx: ['g++', 'clang++']
+        cmake_build_type: ['Release', 'Debug']
+        openmp: ['ON']
+        include:
+          - distro: 'fedora:intel'
+            cxx: 'icpc'
+            cmake_build_type: 'Release'
+            openmp: 'ON'
+          - distro: 'fedora:intel'
+            cxx: 'icpc'
+            cmake_build_type: 'Debug'
+            openmp: 'ON'
+          - distro: 'fedora:intel-oneapi'
+            cxx: 'icpx'
+            cmake_build_type: 'Release'
+            openmp: 'ON'
+          - distro: 'fedora:intel-oneapi'
+            cxx: 'icpx'
+            cmake_build_type: 'Debug'
+            openmp: 'ON'
+    runs-on: ubuntu-latest
+    container: ghcr.io/kokkos/ci-containers/${{ matrix.distro }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2.2.0
+      - uses: actions/cache@v2
+        with:
+          path: ~/.ccache
+          key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${github.ref}-${{ github.sha }}
+          restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{github.ref}}
+      - name: Get trial license
+        if: ${{ matrix.cxx == 'icpc' }}
+        run: |
+          mkdir ~/Licenses
+          curl https://dynamicinstaller.intel.com/api/v2/license > ~/Licenses/intel.lic
+      - name: maybe_disable_death_tests
+        if: ${{ matrix.distro == 'fedora:rawhide' }}
+        run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV
+      - name: build-and-test
+        run: |
+          ccache -z
+          cmake \
+            -DCMAKE_INSTALL_PREFIX=/usr \
+            -DKokkos_ENABLE_HWLOC=ON \
+            -DKokkos_ENABLE_OPENMP=${{ matrix.openmp }} \
+            -DKokkos_ENABLE_TESTS=ON \
+            -DKokkos_ENABLE_EXAMPLES=ON \
+            -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
+            -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \
+            -DBUILD_NAME=${{ matrix.distro }}-${{ matrix.cxx }} \
+            -DBUILD_JOBS=2 -DBINARY_DIR=builddir -DSITE=GitHub-Linux \
+            -P cmake/KokkosCI.cmake
+          ccache -s
+      - name: Test DESTDIR Install
+        run: DESTDIR=${PWD}/install cmake --build builddir --target install && rm -rf ${PWD}/install/usr && rmdir ${PWD}/install
+      - name: Install
+        run: sudo cmake --build builddir --target install
+      - name: Test install
+        working-directory: example/build_cmake_installed
+        run: |
+          cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }}
+          cmake --build builddir
+          cmake --build builddir --target test
diff --git a/packages/kokkos/.github/workflows/osx.yml b/packages/kokkos/.github/workflows/osx.yml
new file mode 100644
index 0000000000000000000000000000000000000000..855b557c829a609f34b82c7e5f307eef60cf0ede
--- /dev/null
+++ b/packages/kokkos/.github/workflows/osx.yml
@@ -0,0 +1,35 @@
+name: github-OSX
+
+on: [push, pull_request]
+
+jobs:
+  osxci:
+    name: osx-ci
+    runs-on: [macos-latest]
+
+    strategy:
+      matrix:
+        include:
+          - backend: "SERIAL"
+            cmake_build_type: "RelWithDebInfo"
+          - backend: "PTHREAD"
+            cmake_build_type: "RelWithDebInfo"
+          - backend: "SERIAL"
+            cmake_build_type: "Debug"
+          - backend: "SERIAL"
+            cmake_build_type: "Release"
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: build-and-test
+        run:
+          cmake
+            -DKokkos_ENABLE_${{ matrix.backend }}=On
+            -DCMAKE_CXX_FLAGS="-Werror"
+            -DCMAKE_CXX_STANDARD=14
+            -DKokkos_ENABLE_COMPILER_WARNINGS=ON
+            -DKokkos_ENABLE_TESTS=On
+            -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
+            -DBUILD_NAME=macOS-${{ matrix.backend }}
+            -DTARGET=install -DBUILD_JOBS=2 -DSITE=GitHub-OSX
+            -P cmake/KokkosCI.cmake
diff --git a/packages/kokkos/.gitignore b/packages/kokkos/.gitignore
index ec6f3487c9f83de8de9977890352fb9ca702255b..eb2257762bdbc1a0536bb04ef935d94387a5578d 100644
--- a/packages/kokkos/.gitignore
+++ b/packages/kokkos/.gitignore
@@ -12,3 +12,12 @@ testing/
 /out/build
 /CMakeSettings.json
 /out/mytest
+
+# build directories in source tree
+/build*
+
+# IDE-specific files/folders
+## VSCode
+/.vscode
+## QtCreator
+/CMakeLists.txt.user*
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
index deecc77edad38b350f7bf85fc5514fde2e1893ad..6dd4101e5bdf1210d26ef2ff0a34f557416c532b 100644
--- a/packages/kokkos/.gitrepo
+++ b/packages/kokkos/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:kokkos/kokkos.git
 	branch = master
-	commit = 1fb0c284d458c75370094921d9f202c287502325
-	parent = 55da1f845ac4f9ea049f2d6a97c7edef95a887ab
-	cmdver = 0.4.3
+	commit = 4b97a22ff7be7635116930bb97173058d6079202
+	parent = f2fc77ba9037b2a2032ab980fb445175441f6d1f
 	method = merge
+	cmdver = 0.4.3
diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins
index 889abe33f8305a9f3053079f2dffac4be7abd28e..001171d648e7cfb2236d17439720562707faaab4 100644
--- a/packages/kokkos/.jenkins
+++ b/packages/kokkos/.jenkins
@@ -5,6 +5,8 @@ pipeline {
         CCACHE_DIR = '/tmp/ccache'
         CCACHE_MAXSIZE = '10G'
         CCACHE_CPP2 = 'true'
+        BUILD_JOBS = 8
+        SITE = 'Jenkins'
     }
     stages {
         stage('Clang-Format') {
@@ -28,25 +30,27 @@ pipeline {
                         dockerfile {
                             filename 'Dockerfile.sycl'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=intel/oneapi-basekit:devel-ubuntu18.04'
-                            label 'docker'
+                            label 'nvidia-docker && volta'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache'
                         }
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''rm -rf build && \
                               cmake \
-                                -DCMAKE_BUILD_TYPE=Debug \
+                                -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
                                 -DCMAKE_CXX_COMPILER=clang++ \
-                                -DCMAKE_CXX_FLAGS="-Werror" \
+                                -DCMAKE_CXX_FLAGS="-Werror -Wno-unknown-cuda-version -Wno-gnu-zero-variadic-macro-arguments" \
+                                -DKokkos_ARCH_VOLTA70=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_EXAMPLES=ON \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_SYCL=ON \
+                                -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
                                 -DCMAKE_CXX_STANDARD=17 \
-                              .. && \
-                              make -j8 && ctest --verbose'''
+                                -DBUILD_NAME=${STAGE_NAME} \
+                              -P cmake/KokkosCI.cmake'''
                     }
                     post {
                         always {
@@ -68,11 +72,12 @@ pipeline {
                         OMP_NUM_THREADS = 8
                         OMP_PLACES = 'threads'
                         OMP_PROC_BIND = 'spread'
+                        LC_ALL = 'C'
                     }
                     steps {
                         sh 'ccache --zero-stats'
                         sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig'
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''rm -rf build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Debug \
                                 -DCMAKE_CXX_COMPILER=hipcc \
@@ -83,8 +88,8 @@ pipeline {
                                 -DKokkos_ENABLE_HIP=ON \
                                 -DKokkos_ARCH_VEGA906=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
-                              .. && \
-                              make -j8 && ctest --verbose'''
+                                -DBUILD_NAME=${STAGE_NAME} \
+                              -P cmake/KokkosCI.cmake'''
                     }
                     post {
                         always {
@@ -102,9 +107,12 @@ pipeline {
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
                         }
                     }
+                    environment {
+                        LC_ALL = 'C'
+                    }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''rm -rf build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=RelWithDebInfo \
                                 -DCMAKE_CXX_COMPILER=hipcc \
@@ -114,8 +122,8 @@ pipeline {
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_HIP=ON \
                                 -DKokkos_ARCH_VEGA906=ON \
-                              .. && \
-                              make -j8 && ctest --verbose'''
+                                -DBUILD_NAME=${STAGE_NAME} \
+                              -P cmake/KokkosCI.cmake'''
                     }
                     post {
                         always {
@@ -134,19 +142,19 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''rm -rf build && \
                               cmake \
-                                -DCMAKE_BUILD_TYPE=Debug \
+                                -DCMAKE_BUILD_TYPE=RelWithDebInfo \
                                 -DCMAKE_CXX_COMPILER=clang++ \
-                                -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror" \
+                                -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror -Wno-undefined-internal -Wno-pass-failed" \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_TUNING=ON \
                                 -DKokkos_ENABLE_OPENMPTARGET=ON \
                                 -DKokkos_ARCH_VOLTA70=ON \
                                 -DCMAKE_CXX_STANDARD=17 \
-                              .. && \
-                              make -j8 && ctest --verbose'''
+                                -DBUILD_NAME=${STAGE_NAME} \
+                              -P cmake/KokkosCI.cmake'''
                     }
                     post {
                         always {
@@ -165,7 +173,7 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''rm -rf build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \
@@ -179,8 +187,8 @@ pipeline {
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_TUNING=ON \
                                 -DKokkos_ARCH_VOLTA70=ON \
-                              .. && \
-                              make -j8 && ctest --verbose'''
+                                -DBUILD_NAME=${STAGE_NAME} \
+                              -P cmake/KokkosCI.cmake'''
                     }
                     post {
                         always {
@@ -222,7 +230,7 @@ pipeline {
                         dockerfile {
                             filename 'Dockerfile.nvcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran" --build-arg CMAKE_VERSION=3.17.3'
+                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3'
                             label 'nvidia-docker && volta'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
@@ -236,7 +244,7 @@ pipeline {
                     steps {
                         sh 'ccache --zero-stats'
                         sh '''rm -rf install && mkdir -p install && \
-                              rm -rf build && mkdir -p build && cd build && \
+                              rm -rf build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER=g++-8 \
@@ -248,11 +256,10 @@ pipeline {
                                 -DKokkos_ENABLE_CUDA_LAMBDA=OFF \
                                 -DKokkos_ENABLE_CUDA_UVM=ON \
                                 -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \
-                                -DKokkos_ARCH_VOLTA70=ON \
-                                -DCMAKE_INSTALL_PREFIX=${PWD}/../install \
-                              .. && \
-                              make -j8 install && \
-                              cd .. && \
+                                -DCMAKE_INSTALL_PREFIX=${PWD}/install \
+                                -DBUILD_NAME=${STAGE_NAME} \
+                                -DTARGET=install \
+                              -P cmake/KokkosCI.cmake && \
                               rm -rf build-tests && mkdir -p build-tests && cd build-tests && \
                               export CMAKE_PREFIX_PATH=${PWD}/../install && \
                               cmake \
@@ -271,7 +278,11 @@ pipeline {
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DCMAKE_CXX_STANDARD=17 \
                               .. && \
-                              make -j8 && ctest --verbose'''
+                              make -j8 && ctest --verbose && \
+                              cd ../.. && \
+                              cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \
+                              cmake --build build_cmake_installed_different_compiler/build --target all && \
+                              cmake --build build_cmake_installed_different_compiler/build --target test'''
                     }
                     post {
                         always {
@@ -284,14 +295,14 @@ pipeline {
                         dockerfile {
                             filename 'Dockerfile.nvcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:10.1-devel --build-arg CMAKE_VERSION=3.15.5'
+                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:10.1-devel'
                             label 'nvidia-docker && volta'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''rm -rf build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Debug \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -305,9 +316,10 @@ pipeline {
                                 -DKokkos_ENABLE_CUDA=ON \
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
-                              .. && \
-                              make -j8 && ctest --verbose && \
-                              cd ../example/build_cmake_in_tree && \
+                                -DBUILD_NAME=${STAGE_NAME} \
+                                -DTARGET=install \
+                              -P cmake/KokkosCI.cmake && \
+                              cd example/build_cmake_in_tree && \
                               rm -rf build && mkdir -p build && cd build && \
                               cmake -DCMAKE_CXX_STANDARD=14 .. && make -j8 && ctest --verbose'''
                     }
@@ -330,7 +342,7 @@ pipeline {
                         OMP_PROC_BIND = 'true'
                     }
                     steps {
-                        sh '''rm -rf build && mkdir -p build && cd build && \
+                        sh '''rm -rf build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_STANDARD=14 \
@@ -339,8 +351,9 @@ pipeline {
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
-                              .. && \
-                              make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c'''
+                                -DBUILD_NAME=${STAGE_NAME} \
+                              -P cmake/KokkosCI.cmake && \
+                              gcc -I$PWD/core/src core/unit_test/tools/TestCInterface.c'''
                     }
                 }
             }
diff --git a/packages/kokkos/.travis.yml b/packages/kokkos/.travis.yml
index d156e91ee0984af24c0127c52fcba674bce7fa82..04ef01c1602cf87aae3e39225037d65f49651f62 100644
--- a/packages/kokkos/.travis.yml
+++ b/packages/kokkos/.travis.yml
@@ -4,7 +4,6 @@ language: cpp
 
 os:
   - linux
-  - osx
 
 compiler:
   - gcc
@@ -30,7 +29,7 @@ branches:
   - /^release/
 
 env:
-  - 
+  -
 #  - BACKEND="OPENMP"
   - BACKEND="PTHREAD"
   - CMAKE_BUILD_TYPE=Debug COVERAGE=yes GTEST_FILTER="-*DeathTest*"
@@ -42,38 +41,40 @@ env:
 
 matrix:
   exclude:
-# Apple GCC is just an alias to AppleClang
-    - os: osx
-      compiler: gcc
-# Apple Clang doesn't support OpenMP
-    - os: osx
-      compiler: clang
-      env: CMAKE_BUILD_TYPE=Debug BACKEND="OPENMP" COVERAGE=yes GTEST_FILTER="-*DeathTest*"
-    - os: osx
-      compiler: clang
-      env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP"
-# We do this as canary
     - os: linux
       compiler: gcc
       env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP"
 
+# Install newer CMake. The distribution comes with CMake 3.12.4 but we require at least 3.16
+install:
+  - CMAKE_VERSION=3.17.1
+  - CMAKE_DIR=/opt/cmake
+  - CMAKE_KEY=2D2CEF1034921684 &&
+    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} &&
+    CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh &&
+    CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt &&
+    wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} &&
+    wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc &&
+    wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} &&
+    #gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} &&
+    #gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} &&
+    #grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check &&
+    mkdir -p ${CMAKE_DIR} &&
+    sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} &&
+    rm cmake*
+  - PATH=${CMAKE_DIR}/bin:$PATH
+  - cd ${TRAVIS_BUILD_DIR}
+
 before_script:
-  - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then
-      brew update;
-      export HOMEBREW_NO_AUTO_UPDATE=1;
-      brew ls --versions ccache   > /dev/null || brew install ccache;
-      export PATH=/usr/local/opt/ccache/libexec:$PATH;
-      export CXXFLAGS="${CXXFLAGS} -Wno-unused-command-line-argument";
-      if [[ ${BACKEND} == "OPENMP" ]]; then brew install libomp; fi
-    fi
   - ccache -z
-  - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; fi
+  - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; export BUILD_NAME_SUFFIX="-Coverage"; fi
   - if [[ ! ${CMAKE_BUILD_TYPE} ]]; then export CXXFLAGS="${CXXFLAGS} -O2"; fi
 
 script:
   - export OMP_NUM_THREADS=2
   - export OMP_PLACES=threads
   - export OMP_PROC_BIND=spread
+  - export BUILD_JOBS=2
   # LD_LIBRARY_PATH workaround to find clang's libomp: https://github.com/travis-ci/travis-ci/issues/8613
   - if [[ ${CC} = clang ]]; then export LD_LIBRARY_PATH=/usr/local/clang/lib${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH; fi
   # enable ccache for clang on linux and add CCACHE_CPP2 to avoid 'Argument unused during compilation -I...' warning
@@ -81,17 +82,17 @@ script:
       ln -s /usr/bin/ccache $HOME/bin/clang++;
       export CCACHE_CPP2=yes;
     fi
-  - mkdir build &&
-    pushd build &&
-    cmake ..
+  - cmake
           ${BACKEND:+-DKokkos_ENABLE_${BACKEND}=On}
           -DCMAKE_CXX_FLAGS="${CXXFLAGS} -Werror"
           -DCMAKE_CXX_STANDARD=14
           -DKokkos_ENABLE_COMPILER_WARNINGS=ON
           -DKokkos_ENABLE_TESTS=On
-          ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}} &&
-    make VERBOSE=1 -j2 &&
-    travis_wait 60 make test CTEST_OUTPUT_ON_FAILURE=1 &&
+          ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}}
+          -DBUILD_NAME="${CC}-${BACKEND}${BUILD_NAME_SUFFIX}"
+          -DSITE=Travis
+          -P cmake/KokkosCI.cmake &&
+    pushd build &&
     make install DESTDIR=${PWD}/install && rm -rf ${PWD}/install/usr/local && rmdir ${PWD}/install/usr &&
     popd
 
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index c759181aa21ec3086507d678ecb9955ae4828681..3ce38c37d866dacc25528f5597461e7629175e00 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,5 +1,168 @@
 # Change Log
 
+## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00)
+
+**Highlights:**
+- SYCL Backend Almost Feature Complete
+- OpenMPTarget Backend Almost Feature Complete
+- Performance Improvements for HIP backend
+- Require CMake 3.16 or newer
+- Tool Callback Interface Enhancements
+- cmath wrapper functions available now in Kokkos::Experimental
+
+**Features:**
+- Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861)
+- Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849)
+- OpenMPTarget: Adding Implementation for nested reducers [\#3845](https://github.com/kokkos/kokkos/pull/3845)
+- Implement UniqueToken for SYCL [\#3833](https://github.com/kokkos/kokkos/pull/3833)
+- OpenMPTarget: UniqueToken::Global implementation [\#3823](https://github.com/kokkos/kokkos/pull/3823)
+- DualView sync's on ExecutionSpaces [\#3822](https://github.com/kokkos/kokkos/pull/3822)
+- SYCL outer TeamPolicy parallel_reduce [\#3818](https://github.com/kokkos/kokkos/pull/3818)
+- SYCL TeamPolicy::team_scan [\#3815](https://github.com/kokkos/kokkos/pull/3815)
+- SYCL MDRangePolicy parallel_reduce [\#3801](https://github.com/kokkos/kokkos/pull/3801)
+- Enable use of execution space instances in ScatterView [\#3786](https://github.com/kokkos/kokkos/pull/3786)
+- SYCL TeamPolicy nested parallel_reduce [\#3783](https://github.com/kokkos/kokkos/pull/3783)
+- OpenMPTarget: MDRange with TagType for parallel_for [\#3781](https://github.com/kokkos/kokkos/pull/3781)
+- Adding OpenMPTarget parallel_scan [\#3655](https://github.com/kokkos/kokkos/pull/3655)
+- SYCL basic TeamPolicy [\#3654](https://github.com/kokkos/kokkos/pull/3654)
+- OpenMPTarget: scratch memory implementation [\#3611](https://github.com/kokkos/kokkos/pull/3611)
+
+**Implemented enhancements Backends and Archs:**
+- SYCL choose a specific GPU [\#3918](https://github.com/kokkos/kokkos/pull/3918)
+- [HIP] Lock access to scratch memory when using Teams [\#3916](https://github.com/kokkos/kokkos/pull/3916)
+- [HIP] fix multithreaded access to get_next_driver [\#3908](https://github.com/kokkos/kokkos/pull/3908)
+- Forward declare HIPHostPinnedSpace and SYCLSharedUSMSpace [\#3902](https://github.com/kokkos/kokkos/pull/3902)
+- Let SYCL USMObjectMem use SharedAllocationRecord [\#3898](https://github.com/kokkos/kokkos/pull/3898)
+- Implement clock_tic for SYCL [\#3893](https://github.com/kokkos/kokkos/pull/3893)
+- Don't use a static variable in HIPInternal::scratch_space [\#3866](https://github.com/kokkos/kokkos/pull/3866)(https://github.com/kokkos/kokkos/pull/3866)
+- Reuse memory for SYCL parallel_reduce [\#3873](https://github.com/kokkos/kokkos/pull/3873)
+- Update SYCL compiler in CI [\#3826](https://github.com/kokkos/kokkos/pull/3826)
+- Introduce HostSharedPtr to manage m_space_instance for Cuda/HIP/SYCL [\#3824](https://github.com/kokkos/kokkos/pull/3824)
+- [HIP] Use shuffle for range reduction [\#3811](https://github.com/kokkos/kokkos/pull/3811)
+- OpenMPTarget: Changes to the hierarchical parallelism [\#3808](https://github.com/kokkos/kokkos/pull/3808)
+- Remove ExtendedReferenceWrapper for SYCL parallel_reduce [\#3802](https://github.com/kokkos/kokkos/pull/3802)
+- Eliminate sycl_indirect_launch [\#3777](https://github.com/kokkos/kokkos/pull/3777)
+- OpenMPTarget: scratch implementation for parallel_reduce [\#3776](https://github.com/kokkos/kokkos/pull/3776)
+- Allow initializing SYCL execution space from sycl::queue and SYCL::impl_static_fence [\#3767](https://github.com/kokkos/kokkos/pull/3767)
+- SYCL TeamPolicy scratch memory alternative [\#3763](https://github.com/kokkos/kokkos/pull/3763)
+- Alternative implementation for SYCL TeamPolicy [\#3759](https://github.com/kokkos/kokkos/pull/3759)
+- Unify handling of synchronous errors in SYCL [\#3754](https://github.com/kokkos/kokkos/pull/3754)
+- core/Cuda: Half_t updates for cgsolve [\#3746](https://github.com/kokkos/kokkos/pull/3746)
+- Unify HIPParallelLaunch structures [\#3733](https://github.com/kokkos/kokkos/pull/3733)
+- Improve performance for SYCL parallel_reduce [\#3732](https://github.com/kokkos/kokkos/pull/3732)
+- Use consistent types in Kokkos_OpenMPTarget_Parallel.hpp [\#3703](https://github.com/kokkos/kokkos/pull/3703)
+- Implement non-blocking kernel launches for HIP backend [\#3697](https://github.com/kokkos/kokkos/pull/3697)
+- Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677)
+- Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671)
+- Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626)
+- Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564)
+- Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511)
+
+**Implemented enhancements BuildSystem:**
+- Also require C++14 when building gtest [\#3912](https://github.com/kokkos/kokkos/pull/3912)
+- Fix compiling SYCL with OpenMP [\#3874](https://github.com/kokkos/kokkos/pull/3874)
+- Require C++17 for SYCL (at configuration time) [\#3869](https://github.com/kokkos/kokkos/pull/3869)
+- Add COMPILE_DEFINITIONS argument to kokkos_create_imported_tpl [\#3862](https://github.com/kokkos/kokkos/pull/3862)
+- Do not pass arch flags to the linker with no rdc [\#3846](https://github.com/kokkos/kokkos/pull/3846)
+- Try compiling C++14 check with C++14 support and print error message [\#3843](https://github.com/kokkos/kokkos/pull/3843)
+- Enable HIP with Cray Clang [\#3842](https://github.com/kokkos/kokkos/pull/3842)
+- Add an option to disable header self containment tests [\#3834](https://github.com/kokkos/kokkos/pull/3834)
+- CMake check for C++14 [\#3809](https://github.com/kokkos/kokkos/pull/3809)
+- Prefer -std=* over --std=* [\#3779](https://github.com/kokkos/kokkos/pull/3779)
+- Kokkos launch compiler updates [\#3778](https://github.com/kokkos/kokkos/pull/3778)
+- Updated comments and enabled no-op for kokkos_launch_compiler [\#3774](https://github.com/kokkos/kokkos/pull/3774)
+- Apple's Clang not correctly recognised [\#3772](https://github.com/kokkos/kokkos/pull/3772)
+- kokkos_launch_compiler + CUDA auto-detect arch [\#3770](https://github.com/kokkos/kokkos/pull/3770)
+- Add Spack test support for Kokkos [\#3753](https://github.com/kokkos/kokkos/pull/3753)
+- Split SYCL tests for aot compilation [\#3741](https://github.com/kokkos/kokkos/pull/3741)
+- Use consistent OpenMP flag for IntelClang [\#3735](https://github.com/kokkos/kokkos/pull/3735)
+- Add support for -Wno-deprecated-gpu-targets [\#3722](https://github.com/kokkos/kokkos/pull/3722)
+- Add configuration to target CUDA compute capability 8.6 [\#3713](https://github.com/kokkos/kokkos/pull/3713)
+- Added VERSION and SOVERSION to KOKKOS_INTERNAL_ADD_LIBRARY [\#3706](https://github.com/kokkos/kokkos/pull/3706)
+- Add fast-math to known NVCC flags [\#3699](https://github.com/kokkos/kokkos/pull/3699)
+- Add MI-100 arch string [\#3698](https://github.com/kokkos/kokkos/pull/3698)
+- Require CMake >=3.16 [\#3679](https://github.com/kokkos/kokkos/pull/3679)
+- KokkosCI.cmake, KokkosCTest.cmake.in, CTestConfig.cmake.in + CI updates [\#2844](https://github.com/kokkos/kokkos/pull/2844)
+
+**Implemented enhancements Tools:**
+- Improve readability of the callback invocation in profiling [\#3860](https://github.com/kokkos/kokkos/pull/3860)
+- V1.1 Tools Interface: incremental, action-based [\#3812](https://github.com/kokkos/kokkos/pull/3812)
+- Enable launch latency simulations [\#3721](https://github.com/kokkos/kokkos/pull/3721)
+- Added metadata callback to tools interface [\#3711](https://github.com/kokkos/kokkos/pull/3711)
+- MDRange Tile Size Tuning [\#3688](https://github.com/kokkos/kokkos/pull/3688)
+- Added support for command-line args for kokkos-tools [\#3627](https://github.com/kokkos/kokkos/pull/3627)
+- Query max tile sizes for an MDRangePolicy, and set tile sizes on an existing policy [\#3481](https://github.com/kokkos/kokkos/pull/3481)
+
+**Implemented enhancements Other:**
+- Try detecting ndevices in get_gpu [\#3921](https://github.com/kokkos/kokkos/pull/3921)
+- Use strcmp to compare names() [\#3909](https://github.com/kokkos/kokkos/pull/3909)
+- Add execution space arguments for constructor overloads that might allocate a new underlying View [\#3904](https://github.com/kokkos/kokkos/pull/3904)
+- Prefix labels in internal use of kokkos_malloc [\#3891](https://github.com/kokkos/kokkos/pull/3891)
+- Prefix labels for internal uses of SharedAllocationRecord [\#3890](https://github.com/kokkos/kokkos/pull/3890)
+- Add missing hypot math function [\#3880](https://github.com/kokkos/kokkos/pull/3880)
+- Unify algorithm unit tests to avoid code duplication [\#3851](https://github.com/kokkos/kokkos/pull/3851)
+- DualView.template view() better matches for Devices in UVMSpace cases [\#3857](https://github.com/kokkos/kokkos/pull/3857)
+- More extensive disentangling of Policy Traits [\#3829](https://github.com/kokkos/kokkos/pull/3829)
+- Replaced nanosleep and sched_yield with STL routines [\#3825](https://github.com/kokkos/kokkos/pull/3825)
+- Constructing Atomic Subviews [\#3810](https://github.com/kokkos/kokkos/pull/3810)
+- Metadata Declaration in Core [\#3729](https://github.com/kokkos/kokkos/pull/3729)
+- Allow using tagged final functor in parallel_reduce [\#3714](https://github.com/kokkos/kokkos/pull/3714)
+- Major duplicate code removal in SharedAllocationRecord specializations [\#3658](https://github.com/kokkos/kokkos/pull/3658)
+
+**Fixed bugs:**
+- Provide forward declarations in Kokkos_ViewLayoutTiled.hpp for XL [\#3911](https://github.com/kokkos/kokkos/pull/3911)
+- Fixup absolute value of floating points in Kokkos complex [\#3882](https://github.com/kokkos/kokkos/pull/3882)
+- Address intel 17 ICE [\#3881](https://github.com/kokkos/kokkos/pull/3881)
+- Add missing pow(Kokkos::complex) overloads [\#3868](https://github.com/kokkos/kokkos/pull/3868)
+- Fix bug {pow, log}(Kokkos::complex) [\#3866](https://github.com/kokkos/kokkos/pull/3866)(https://github.com/kokkos/kokkos/pull/3866)
+- Cleanup writing to output streams in Cuda [\#3859](https://github.com/kokkos/kokkos/pull/3859)
+- Fixup cache CUDA fallback execution space instance used by DualView::sync [\#3856](https://github.com/kokkos/kokkos/pull/3856)
+- Fix cmake warning with pthread [\#3854](https://github.com/kokkos/kokkos/pull/3854)
+- Fix typo FOUND_CUDA_{DRIVVER -> DRIVER} [\#3852](https://github.com/kokkos/kokkos/pull/3852)
+- Fix bug in SYCL team_reduce [\#3848](https://github.com/kokkos/kokkos/pull/3848)
+- Atrocious bug in MDRange tuning [\#3803](https://github.com/kokkos/kokkos/pull/3803)
+- Fix compiling SYCL with Kokkos_ENABLE_TUNING=ON [\#3800](https://github.com/kokkos/kokkos/pull/3800)
+- Fixed command line parsing bug [\#3797](https://github.com/kokkos/kokkos/pull/3797)
+- Workaround race condition in SYCL parallel_reduce [\#3782](https://github.com/kokkos/kokkos/pull/3782)
+- Fix Atomic{Min,Max} for Kepler30 [\#3780](https://github.com/kokkos/kokkos/pull/3780)
+- Fix SYCL typo [\#3755](https://github.com/kokkos/kokkos/pull/3755)
+- Fixed Kokkos_install_additional_files macro [\#3752](https://github.com/kokkos/kokkos/pull/3752)
+- Fix a typo for Kokkos_ARCH_A64FX [\#3751](https://github.com/kokkos/kokkos/pull/3751)
+- OpenMPTarget: fixes and workarounds to work with "Release" build type [\#3748](https://github.com/kokkos/kokkos/pull/3748)
+- Fix parsing bug for number of devices command line argument [\#3724](https://github.com/kokkos/kokkos/pull/3724)
+- Avoid more warnings with clang and C++20 [\#3719](https://github.com/kokkos/kokkos/pull/3719)
+- Fix gcc-10.1 C++20 warnings [\#3718](https://github.com/kokkos/kokkos/pull/3718)
+- Fix cuda cache config not being set correct [\#3712](https://github.com/kokkos/kokkos/pull/3712)
+- Fix dualview deepcopy perftools [\#3701](https://github.com/kokkos/kokkos/pull/3701)
+- use drand instead of frand in drand [\#3696](https://github.com/kokkos/kokkos/pull/3696)
+
+**Incompatibilities:**
+- Remove unimplemented member functions of SYCLDevice [\#3919](https://github.com/kokkos/kokkos/pull/3919)
+- Replace cl::sycl [\#3896](https://github.com/kokkos/kokkos/pull/3896)
+- Get rid of SYCL workaround in Kokkos_Complex.hpp [\#3884](https://github.com/kokkos/kokkos/pull/3884)
+- Replace most uses of if_c [\#3883](https://github.com/kokkos/kokkos/pull/3883)
+- Remove Impl::enable_if_type [\#3863](https://github.com/kokkos/kokkos/pull/3863)
+- Remove HostBarrier test [\#3847](https://github.com/kokkos/kokkos/pull/3847)
+- Avoid (void) interface [\#3836](https://github.com/kokkos/kokkos/pull/3836)
+- Remove VerifyExecutionCanAccessMemorySpace [\#3813](https://github.com/kokkos/kokkos/pull/3813)
+- Avoid duplicated code in ScratchMemorySpace [\#3793](https://github.com/kokkos/kokkos/pull/3793)
+- Remove superfluous FunctorFinal specialization [\#3788](https://github.com/kokkos/kokkos/pull/3788)
+- Rename cl::sycl -> sycl in Kokkos_MathematicalFunctions.hpp [\#3678](https://github.com/kokkos/kokkos/pull/3678)
+- Remove integer_sequence backward compatibility implementation [\#3533](https://github.com/kokkos/kokkos/pull/3533)
+
+**Enabled tests:**
+- Fixup re-enable core performance tests [\#3903](https://github.com/kokkos/kokkos/pull/3903)
+- Enable more SYCL tests [\#3900](https://github.com/kokkos/kokkos/pull/3900)
+- Restrict MDRange Policy tests for Intel GPUs [\#3853](https://github.com/kokkos/kokkos/pull/3853)
+- Disable death tests for rawhide [\#3844](https://github.com/kokkos/kokkos/pull/3844)
+- OpenMPTarget: Block unit tests that do not pass with the nvidia compiler [\#3839](https://github.com/kokkos/kokkos/pull/3839)
+- Enable Bitset container test for SYCL [\#3830](https://github.com/kokkos/kokkos/pull/3830)
+- Enable some more SYCL tests [\#3744](https://github.com/kokkos/kokkos/pull/3744)
+- Enable SYCL atomic tests [\#3742](https://github.com/kokkos/kokkos/pull/3742)
+- Enable more SYCL perf_tests [\#3692](https://github.com/kokkos/kokkos/pull/3692)
+- Enable examples for SYCL [\#3691](https://github.com/kokkos/kokkos/pull/3691)
+
 ## [3.3.01](https://github.com/kokkos/kokkos/tree/3.3.01) (2021-01-06)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.00...3.3.01)
 
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index 7bc3c7725648d71c6703fd345ce23d159d40c1f8..6fc1bf7d2f7fd3b02a785b1184923cde07b438b2 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -72,7 +72,7 @@ ENDFUNCTION()
 LIST(APPEND CMAKE_MODULE_PATH cmake/Modules)
 
 IF(NOT KOKKOS_HAS_TRILINOS)
-  cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+  cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
   set(CMAKE_DISABLE_SOURCE_CHANGES ON)
   set(CMAKE_DISABLE_IN_SOURCE_BUILD ON)
   IF (Spack_WORKAROUND)
@@ -111,27 +111,25 @@ ENDIF()
 
 
 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 3)
-set(Kokkos_VERSION_PATCH 1)
+set(Kokkos_VERSION_MINOR 4)
+set(Kokkos_VERSION_PATCH 00)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
-IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0")
-  MESSAGE(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
-  CMAKE_POLICY(SET CMP0074 NEW)
-ENDIF()
+MESSAGE(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
+CMAKE_POLICY(SET CMP0074 NEW)
 
 # Load either the real TriBITS or a TriBITS wrapper
 # for certain utility functions that are universal (like GLOBAL_SET)
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake)
 
-IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0")
-  #If we are building CUDA, we have tricked CMake because we declare a CXX project
-  #If the default C++ standard for a given compiler matches the requested
-  #standard, then CMake just omits the -std flag in later versions of CMake
-  #This breaks CUDA compilation (CUDA compiler can have a different default
-  #-std then the underlying host compiler by itself). Setting this variable
-  #forces CMake to always add the -std flag even if it thinks it doesn't need it
+IF (Kokkos_ENABLE_CUDA)
+  # If we are building CUDA, we have tricked CMake because we declare a CXX project
+  # If the default C++ standard for a given compiler matches the requested
+  # standard, then CMake just omits the -std flag in later versions of CMake
+  # This breaks CUDA compilation (CUDA compiler can have a different default
+  # -std then the underlying host compiler by itself). Setting this variable
+  # forces CMake to always add the -std flag even if it thinks it doesn't need it
   GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98)
 ENDIF()
 
@@ -139,15 +137,19 @@ ENDIF()
 # I really wish these were regular variables
 # but scoping issues can make it difficult
 GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
-GLOBAL_SET(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE)
+GLOBAL_SET(KOKKOS_LINK_OPTIONS)
 GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
 GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
 GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
 # We need to append text here for making sure TPLs
 # we import are available for an installed Kokkos
 GLOBAL_SET(KOKKOS_TPL_EXPORTS)
-# this could probably be scoped to project
+# KOKKOS_DEPENDENCE is used by kokkos_launch_compiler
 GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE)
+# MSVC never goes through kokkos_launch_compiler
+IF(NOT MSVC)
+    GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE)
+ENDIF()
 
 # Include a set of Kokkos-specific wrapper functions that
 # will either call raw CMake or TriBITS
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index 3b6a5ff4368c966a8a44f04bdbe64c9fceb3745b..2599121d70ada48567c61fdc63ba94925a402267 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -1,8 +1,8 @@
 # Default settings common options.
 
 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 3
-KOKKOS_VERSION_PATCH = 1
+KOKKOS_VERSION_MINOR = 4
+KOKKOS_VERSION_PATCH = 00
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,OpenMP,Pthread,Serial
@@ -10,7 +10,7 @@ KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MIN
 KOKKOS_DEVICES ?= "Pthread"
 # Options: 
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKX
-# NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80
+# NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
 # AMD-GPUS: Vega900,Vega906,Vega908
@@ -154,17 +154,17 @@ KOKKOS_INTERNAL_OS_DARWIN      := $(call kokkos_has_string,$(KOKKOS_OS),Darwin)
 KOKKOS_CXX_VERSION                   := $(strip $(shell $(CXX) --version       2>&1))
 KOKKOS_INTERNAL_COMPILER_INTEL       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation)
 KOKKOS_INTERNAL_COMPILER_PGI         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI)
-KOKKOS_INTERNAL_COMPILER_XL          := $(strip $(shell $(CXX) -qversion       2>&1 | grep XL                  | wc -l))
-KOKKOS_INTERNAL_COMPILER_CRAY        := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-"               | wc -l))
-KOKKOS_INTERNAL_COMPILER_NVCC        := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l)>0" | bc))
+KOKKOS_INTERNAL_COMPILER_XL          := $(strip $(shell $(CXX) -qversion       2>&1 | grep -c XL))
+KOKKOS_INTERNAL_COMPILER_CRAY        := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-"))
+KOKKOS_INTERNAL_COMPILER_NVCC        := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc))
 KOKKOS_INTERNAL_COMPILER_CLANG       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
-KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM)
+KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
 KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 KOKKOS_INTERNAL_COMPILER_GCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)
 
 # Check Host Compiler if using NVCC through nvcc_wrapper
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
-  KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep nvcc_wrapper | wc -l))
+  KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep -c nvcc_wrapper))
   ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1)
 
     KOKKOS_CXX_HOST_VERSION             := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version       2>&1))
@@ -287,11 +287,11 @@ else
       #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z
       #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a
     else
-      KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14
-      KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y
-      KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17
-      KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
-      KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a
+      KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
+      KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
+      KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
+      KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1z
+      KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a
     endif
   endif
 endif
@@ -322,6 +322,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
 KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72)
 KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
 KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
+KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
                                               + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
                                               + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
@@ -334,7 +335,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
                                               + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70)   \
                                               + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72)   \
                                               + $(KOKKOS_INTERNAL_USE_ARCH_TURING75)  \
-                                              + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80))
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
 
 #SEK: This seems like a bug to me
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -575,10 +577,10 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TUNING")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING")
 endif
 
-tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LIBDL")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LIBDL")
 
 ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
   ifneq ($(KOKKOS_CMAKE), yes)
@@ -742,6 +744,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1)
 
   KOKKOS_CXXFLAGS += -march=armv8.2-a+sve
   KOKKOS_LDFLAGS += -march=armv8.2-a+sve
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    KOKKOS_CXXFLAGS += -msve-vector-bits=512
+    KOKKOS_LDFLAGS += -msve-vector-bits=512
+  endif
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+    KOKKOS_CXXFLAGS += -msve-vector-bits=512
+    KOKKOS_LDFLAGS += -msve-vector-bits=512
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1)
@@ -1090,6 +1100,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80")
     KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80
   endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
+  endif
 
   ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
     KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
@@ -1149,7 +1164,7 @@ endif
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
 
 ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
-  KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
+  KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep -c define))
 else
   KOKKOS_INTERNAL_NEW_CONFIG := 1
 endif
@@ -1171,41 +1186,41 @@ tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config
 tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
 tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
    ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
    else
    endif
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-   tmp := $(call kokkos_append_config_header,"\#include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"\#include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
+   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
 endif
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
@@ -1324,7 +1339,7 @@ ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 endif
 
 # With Cygwin functions such as fdopen and fileno are not defined
-# when strict ansi is enabled. strict ansi gets enabled with --std=c++14
+# when strict ansi is enabled. strict ansi gets enabled with -std=c++14
 # though. So we hard undefine it here. Not sure if that has any bad side effects
 # This is needed for gtest actually, not for Kokkos itself!
 ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets
index 5a03f7d17e946d4ed0792302d25a8de30a594aee..cf9fc242420e1dbbb519b3312cf1a4c3b4354738 100644
--- a/packages/kokkos/Makefile.targets
+++ b/packages/kokkos/Makefile.targets
@@ -36,6 +36,8 @@ Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
 Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp 
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
+Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
index 69d6cf8f35ea4705885900f9fc2bfdb608c54373..904cf5ccb967037d94ac9b4a06144a4f7333dd3d 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -668,6 +668,25 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
 };
 #endif
 
+#ifdef KOKKOS_ENABLE_SYCL
+template <>
+struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
+  using locks_view_type = View<int*, Kokkos::Experimental::SYCL>;
+  KOKKOS_FUNCTION
+  static int get_state_idx(const locks_view_type& locks_) {
+#ifdef KOKKOS_ARCH_INTEL_GEN
+    int i = Kokkos::Impl::clock_tic() % locks_.extent(0);
+#else
+    int i = 0;
+#endif
+    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+      i = (i + 1) % static_cast<int>(locks_.extent(0));
+    }
+    return i;
+  }
+};
+#endif
+
 }  // namespace Impl
 
 template <class DeviceType>
@@ -1028,7 +1047,7 @@ class Random_XorShift1024 {
 
   KOKKOS_INLINE_FUNCTION
   double drand(const double& start, const double& end) {
-    return frand(end - start) + start;
+    return drand(end - start) + start;
   }
 
   // Marsaglia polar method for drawing a standard normal distributed random
diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
index 819c9e54bae4f293e76252679d041de44c25c051..9109837985a91ad14245133682af15aca59be503 100644
--- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
@@ -3,6 +3,7 @@
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
+KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
 
 SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest)
@@ -25,7 +26,7 @@ KOKKOS_ADD_TEST_LIBRARY(
 TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0)
 
 IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu")))
-TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11)
+  TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_14)
 ENDIF()
 
 # Suppress clang-tidy diagnostics on code that we do not have control over
@@ -33,51 +34,42 @@ IF(CMAKE_CXX_CLANG_TIDY)
   SET_TARGET_PROPERTIES(kokkosalgorithms_gtest PROPERTIES CXX_CLANG_TIDY "")
 ENDIF()
 
-SET(SOURCES
-  UnitTestMain.cpp
-)
+SET(ALGORITHM UnitTestMain.cpp)
 
 IF(Kokkos_ENABLE_OPENMP)
-  LIST( APPEND SOURCES
-    TestOpenMP.cpp
+  LIST(APPEND ALGORITHM_SOURCES
     TestOpenMP_Sort1D.cpp
     TestOpenMP_Sort3D.cpp
     TestOpenMP_SortDynamicView.cpp
-    TestOpenMP_Random.cpp
-  )
-ENDIF()
-
-IF(Kokkos_ENABLE_HIP)
-  LIST( APPEND SOURCES
-    TestHIP.cpp
   )
 ENDIF()
 
-IF(Kokkos_ENABLE_CUDA)
-  LIST( APPEND SOURCES
-    TestCuda.cpp
-  )
-ENDIF()
-
-IF(Kokkos_ENABLE_HPX)
-  LIST( APPEND SOURCES
-    TestHPX.cpp
-  )
-ENDIF()
-
-IF(Kokkos_ENABLE_SERIAL)
-  LIST( APPEND SOURCES
-    TestSerial.cpp
-  )
-ENDIF()
-
-IF(Kokkos_ENABLE_PTHREAD)
-  LIST( APPEND SOURCES
-    TestThreads.cpp
-  )
-ENDIF()
+foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL)
+  # Because there is always an exception to the rule
+  if(Tag STREQUAL "Threads")
+    set(DEVICE "PTHREAD")
+  else()
+    string(TOUPPER ${Tag} DEVICE)
+  endif()
+
+  if(Kokkos_ENABLE_${DEVICE})
+    set(dir ${CMAKE_CURRENT_BINARY_DIR})
+    set(file ${dir}/Test${Tag}.cpp)
+      # Write to a temporary intermediate file and call configure_file to avoid
+      # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
+      file(WRITE ${dir}/dummy.cpp
+          "#include <Test${Tag}_Category.hpp>\n"
+	  "#include <TestRandomCommon.hpp>\n"
+	  "#include <TestSortCommon.hpp>\n"
+      )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ALGORITHM_SOURCES ${file})
+  endif()
+endforeach()
 
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
   UnitTest
-  SOURCES ${SOURCES}
+  SOURCES
+    UnitTestMain.cpp
+    ${ALGORITHM_SOURCES}
 )
diff --git a/packages/kokkos/algorithms/unit_tests/Makefile b/packages/kokkos/algorithms/unit_tests/Makefile
index c112d7c6fcad3b47647078e27c3f11e9433956b5..dd0aa87de0b2c76fe76d03f8ea77092833dd9f63 100644
--- a/packages/kokkos/algorithms/unit_tests/Makefile
+++ b/packages/kokkos/algorithms/unit_tests/Makefile
@@ -20,11 +20,19 @@ override LDFLAGS += -lpthread
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
-KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files
 
 TEST_TARGETS =
 TARGETS =
 
+tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
+  $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\
+    $(shell echo "\#include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \
+    $(shell echo "\#include <TestRandomCommon.hpp>" >> Test$(device).cpp); \
+    $(shell echo "\#include <TestSortCommon.hpp>" >> Test$(device).cpp); \
+  ) \
+)
+
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_Cuda
@@ -44,7 +52,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	OBJ_OPENMP = TestOpenMP.o TestOpenMP_Random.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o
+	OBJ_OPENMP = TestOpenMP.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_OpenMP
 	TEST_TARGETS += test-openmp
 endif
diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp
index a9b2010ad025bd0c967071aca37407bea4a351bf..4a5839f0c80a5298c14ff91422d74664b9dd95bd 100644
--- a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp
@@ -59,6 +59,8 @@ TEST(openmp, SortUnsigned1D) {
   Impl::test_1D_sort<Kokkos::OpenMP, unsigned>(171);
 }
 
+TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort<Kokkos::OpenMP>(); }
+
 }  // namespace Test
 #else
 void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
index caba92c152faac40d46feabb7407e1e6a4e9fb5d..1f14875096dd2fbd0bebf4feea796d4c6ccd79f0 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -491,6 +491,34 @@ void test_random(unsigned int num_draws) {
 }
 }  // namespace Impl
 
+template <typename ExecutionSpace>
+void test_random_xorshift64() {
+#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \
+    defined(KOKKOS_ENABLE_HIP)
+  const int num_draws = 132141141;
+#else  // SERIAL, HPX, OPENMP
+  const int num_draws = 10240000;
+#endif
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<ExecutionSpace>>(num_draws);
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<
+      Kokkos::Device<ExecutionSpace, typename ExecutionSpace::memory_space>>>(
+      num_draws);
+}
+
+template <typename ExecutionSpace>
+void test_random_xorshift1024() {
+#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \
+    defined(KOKKOS_ENABLE_HIP)
+  const int num_draws = 52428813;
+#else  // SERIAL, HPX, OPENMP
+  const int num_draws = 10130144;
+#endif
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<ExecutionSpace>>(
+      num_draws);
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<
+      Kokkos::Device<ExecutionSpace, typename ExecutionSpace::memory_space>>>(
+      num_draws);
+}
 }  // namespace Test
 
 #endif  // KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6d3b59ae1f12422c448a13f5f91f2ed74cc58ff
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp
@@ -0,0 +1,60 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP
+
+#include <TestRandom.hpp>
+
+namespace Test {
+
+TEST(TEST_CATEGORY, Random_XorShift64) {
+  test_random_xorshift64<TEST_EXECSPACE>();
+}
+TEST(TEST_CATEGORY, Random_XorShift1024_0) {
+  test_random_xorshift1024<TEST_EXECSPACE>();
+}
+}  // namespace Test
+
+#endif
diff --git a/packages/kokkos/containers/unit_tests/TestHIP_Category.hpp b/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp
similarity index 88%
rename from packages/kokkos/containers/unit_tests/TestHIP_Category.hpp
rename to packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp
index c2d60d18148b30674de5ee559ecafc09d23d126f..56657b6574b865419a1f93e01a49aa2a3e648736 100644
--- a/packages/kokkos/containers/unit_tests/TestHIP_Category.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp
@@ -42,10 +42,14 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_TEST_HIP_HPP
-#define KOKKOS_TEST_HIP_HPP
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP
 
-#define TEST_CATEGORY hip
-#define TEST_EXECSPACE Kokkos::Experimental::HIP
+#include <TestSort.hpp>
 
+namespace Test {
+TEST(TEST_CATEGORY, SortUnsigned) {
+  Impl::test_sort<TEST_EXECSPACE, unsigned>(171);
+}
+}  // namespace Test
 #endif
diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/appveyor.yml
index c40bf066b7a9c6e9de822cc8124147fa8f241de9..e8763c0b665c4a992f74b70eab0caa915beb33dd 100644
--- a/packages/kokkos/appveyor.yml
+++ b/packages/kokkos/appveyor.yml
@@ -3,8 +3,4 @@ image:
 clone_folder: c:\projects\source
 build_script:
 - cmd: >-
-    mkdir build &&
-    cd build &&
-    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON &&
-    cmake --build . --target install &&
-    ctest -C Debug -V
+    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake
diff --git a/packages/kokkos/bin/kokkos_launch_compiler b/packages/kokkos/bin/kokkos_launch_compiler
index 1fbebf648fa0af5f0ec627b87c603c651aff65e6..d929d24f1dca42fc277940ffb27f54d374e89cd1 100755
--- a/packages/kokkos/bin/kokkos_launch_compiler
+++ b/packages/kokkos/bin/kokkos_launch_compiler
@@ -13,6 +13,17 @@
 #   $1 are 'ar', 'cmake', etc. during the linking phase
 #
 
+# emit a message about the underlying command executed
+: ${DEBUG:=0}
+: ${KOKKOS_DEBUG_LAUNCH_COMPILER:=${DEBUG}}
+
+debug-message()
+{
+    if [ "${KOKKOS_DEBUG_LAUNCH_COMPILER}" -ne 0 ]; then
+        echo -e "##### $(basename ${BASH_SOURCE[0]}) executing: \"$@\"... #####"
+    fi
+}
+
 # check the arguments for the KOKKOS_DEPENDENCE compiler definition
 KOKKOS_DEPENDENCE=0
 for i in ${@}
@@ -23,16 +34,30 @@ do
     fi
 done
 
-# if C++ is not passed, someone is probably trying to invoke it directly
+# if Kokkos compiler is not passed, someone is probably trying to invoke it directly
 if [ -z "${1}" ]; then
-    echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the first argument."
+    echo -e "\n${BASH_SOURCE[0]} was invoked without the Kokkos compiler as the first argument."
     echo "This script is not indended to be directly invoked by any mechanism other"
-    echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake\n"
+    echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n"
+    exit 1
+fi
+
+# if Kokkos compiler is not passed, someone is probably trying to invoke it directly
+if [ -z "${2}" ]; then
+    echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the second argument."
+    echo "This script is not indended to be directly invoked by any mechanism other"
+    echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n"
     exit 1
 fi
 
 # if there aren't two args, this isn't necessarily invalid, just a bit strange
-if [ -z "${2}" ]; then exit 0; fi
+if [ -z "${3}" ]; then exit 0; fi
+
+# store the Kokkos compiler
+KOKKOS_COMPILER=${1}
+
+# remove the Kokkos compiler from the arguments
+shift
 
 # store the expected C++ compiler
 CXX_COMPILER=${1}
@@ -40,48 +65,57 @@ CXX_COMPILER=${1}
 # remove the expected C++ compiler from the arguments
 shift
 
-# after the above shift, $1 is now the exe for the compile or link command, e.g.
-#       kokkos_launch_compiler g++ gcc -c file.c -o file.o
+# NOTE: in below, ${KOKKOS_COMPILER} is usually nvcc_wrapper
+#
+# after the above shifts, $1 is now the exe for the compile or link command, e.g.
+#       kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o
 # becomes:
 #       kokkos_launch_compiler gcc -c file.c -o file.o
-# Check to see if the executable is the C++ compiler and if it is not, then
+# We check to see if the executable is the C++ compiler and if it is not, then
 # just execute the command.
 #
 # Summary:
-#       kokkos_launch_compiler g++ gcc -c file.c -o file.o
+#       kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o
 # results in this command being executed:
 #       gcc -c file.c -o file.o
 # and
-#       kokkos_launch_compiler g++ g++ -c file.cpp -o file.o
+#       kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o
 # results in this command being executed:
-#       nvcc_wrapper -c file.cpp -o file.o
+#       ${KOKKOS_COMPILER} -c file.cpp -o file.o
 if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then
-    # the command does not depend on Kokkos so just execute the command w/o re-directing to nvcc_wrapper
+    debug-message $@
+    # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER}
     eval $@
 else
-    # the executable is the C++ compiler, so we need to re-direct to nvcc_wrapper
+    # the executable is the C++ compiler, so we need to re-direct to ${KOKKOS_COMPILER}
+    if [ ! -f "${KOKKOS_COMPILER}" ]; then
+        echo -e "\nError: the compiler redirect for Kokkos was not found at ${KOKKOS_COMPILER}\n"
+        exit 1
+    fi
 
     # find the nvcc_wrapper from the same build/install
     NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper"
+    if [ "${KOKKOS_COMPILER}" = "${NVCC_WRAPPER}" ]; then
+        # this should only be valid in the install tree -- it will be set to CMAKE_CXX_COMPILER used using Kokkos installation
+        if [ -z $(echo "@NVCC_WRAPPER_DEFAULT_COMPILER@" | grep 'NVCC_WRAPPER_DEFAULT_COMPILER') ]; then
+            : ${NVCC_WRAPPER_DEFAULT_COMPILER:="@NVCC_WRAPPER_DEFAULT_COMPILER@"}
+        fi
 
-    if [ -z "${NVCC_WRAPPER}" ]; then
-        echo -e "\nError: nvcc_wrapper not found in $(dirname ${BASH_SOURCE[0]}).\n"
-        exit 1
-    fi
+        # set default nvcc wrapper compiler if not specified
+        : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}}
+        export NVCC_WRAPPER_DEFAULT_COMPILER
 
-    # set default nvcc wrapper compiler if not specified
-    : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}}
-    export NVCC_WRAPPER_DEFAULT_COMPILER
-
-    # calling itself will cause an infinitely long build
-    if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then
-        echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n"
-        exit 1
+        # nvcc_wrapper calling itself will cause an infinitely long build
+        if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then
+            echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n"
+            exit 1
+        fi
     fi
 
     # discard the compiler from the command
     shift
 
-    # execute nvcc_wrapper
-    ${NVCC_WRAPPER} $@
+    debug-message ${KOKKOS_COMPILER} $@
+    # execute ${KOKKOS_COMPILER} (again, usually nvcc_wrapper)
+    ${KOKKOS_COMPILER} $@
 fi
diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper
index 4ecf4c66d5a069eba4c8ca4e379299dfb6ed53bb..5556e888e34b2f7c2dd18bdb6f47071abde0574b 100755
--- a/packages/kokkos/bin/nvcc_wrapper
+++ b/packages/kokkos/bin/nvcc_wrapper
@@ -191,11 +191,11 @@ do
     shift
     ;;
   #Handle known nvcc args
-  --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
+  --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
     cuda_args="$cuda_args $1"
     ;;
   #Handle more known nvcc args
-  --expt-extended-lambda|--expt-relaxed-constexpr)
+  --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets)
     cuda_args="$cuda_args $1"
     ;;
   #Handle known nvcc args that have an argument
diff --git a/packages/kokkos/cmake/CTestConfig.cmake.in b/packages/kokkos/cmake/CTestConfig.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..1f82c0d64d15e0a4fb346cfb7227be9cd41e5f17
--- /dev/null
+++ b/packages/kokkos/cmake/CTestConfig.cmake.in
@@ -0,0 +1,91 @@
+#----------------------------------------------------------------------------------------#
+#
+#   CTestConfig.cmake template for Kokkos
+#
+#----------------------------------------------------------------------------------------#
+
+#
+#   dash-board related
+#
+set(CTEST_PROJECT_NAME "Kokkos")
+set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC")
+set(CTEST_DROP_METHOD "https")
+set(CTEST_DROP_SITE "cdash.nersc.gov")
+set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}")
+set(CTEST_CDASH_VERSION "1.6")
+set(CTEST_CDASH_QUERY_VERSION TRUE)
+set(CTEST_SUBMIT_RETRY_COUNT "1")
+set(CTEST_SUBMIT_RETRY_DELAY "30")
+
+#
+#   configure/build related
+#
+set(CTEST_BUILD_NAME "@BUILD_NAME@")
+set(CTEST_MODEL "@MODEL@")
+set(CTEST_SITE "@SITE@")
+set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@")
+set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@")
+set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@")
+
+#
+#   configure/build related
+#
+set(CTEST_UPDATE_TYPE "git")
+set(CTEST_UPDATE_VERSION_ONLY ON)
+# set(CTEST_GENERATOR "")
+# set(CTEST_GENERATOR_PLATFORM "")
+
+#
+#   testing related
+#
+set(CTEST_TIMEOUT "7200")
+set(CTEST_TEST_TIMEOUT "7200")
+set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
+set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
+set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576")
+
+#
+#   coverage related
+#
+set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*")
+
+#
+#   commands
+#
+if(NOT "@CHECKOUT_COMMAND@" STREQUAL "")
+    set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@")
+endif()
+set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@")
+set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@")
+set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@")
+if(NOT WIN32)
+    set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@")
+endif()
+set(CTEST_COVERAGE_COMMAND "gcov")
+set(CTEST_MEMORYCHECK_COMMAND "valgrind")
+set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@")
+
+#
+#   various configs
+#
+set(APPEND_VALUE @APPEND@)
+if(APPEND_VALUE)
+    set(APPEND_CTEST APPEND)
+endif()
+
+macro(SET_TEST_PROP VAR)
+    if(NOT "${ARGS}" STREQUAL "")
+        set(${VAR}_CTEST ${VAR} ${ARGN})
+    endif()
+endmacro()
+
+set_test_prop(START           @START@)
+set_test_prop(END             @END@)
+set_test_prop(STRIDE          @STRIDE@)
+set_test_prop(INCLUDE         @INCLUDE@)
+set_test_prop(EXCLUDE         @EXCLUDE@)
+set_test_prop(INCLUDE_LABEL   @INCLUDE_LABEL@)
+set_test_prop(EXCLUDE_LABEL   @EXCLUDE_LABEL@)
+set_test_prop(PARALLEL_LEVEL  @PARALLEL_LEVEL@)
+set_test_prop(STOP_TIME       @STOP_TIME@)
+set_test_prop(COVERAGE_LABELS @LABELS@)
diff --git a/packages/kokkos/cmake/KokkosCI.cmake b/packages/kokkos/cmake/KokkosCI.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e8c9af37ad544a93a62f498e9a903696321a1c75
--- /dev/null
+++ b/packages/kokkos/cmake/KokkosCI.cmake
@@ -0,0 +1,350 @@
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
+
+message(STATUS "")
+
+get_cmake_property(_cached_vars CACHE_VARIABLES)
+set(KOKKOS_CMAKE_ARGS)
+set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT"
+                       "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE")
+list(SORT _cached_vars)
+foreach(_var ${_cached_vars})
+    if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES)
+        list(APPEND KOKKOS_CMAKE_ARGS ${_var})
+        if("${_var}" STREQUAL "CMAKE_BUILD_TYPE")
+            set(BUILD_TYPE "${CMAKE_BUILD_TYPE}")
+        endif()
+    endif()
+endforeach()
+
+
+#----------------------------------------------------------------------------------------#
+#
+#   Macros and variables
+#
+#----------------------------------------------------------------------------------------#
+
+macro(CHECK_REQUIRED VAR)
+    if(NOT DEFINED ${VAR})
+        message(FATAL_ERROR "Error! Variable '${VAR}' must be defined")
+    endif()
+endmacro()
+
+# require the build name variable
+CHECK_REQUIRED(BUILD_NAME)
+
+# uses all args
+macro(SET_DEFAULT VAR)
+    if(NOT DEFINED ${VAR})
+        set(${VAR} ${ARGN})
+    endif()
+    # remove these ctest configuration variables from the defines
+    # passed to the Kokkos configuration
+    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
+        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
+    endif()
+endmacro()
+
+# uses first arg -- useful for selecting via priority from multiple
+# potentially defined variables, e.g.:
+#
+#   set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME})
+#
+macro(SET_DEFAULT_ARG1 VAR)
+    if(NOT DEFINED ${VAR})
+        foreach(_ARG ${ARGN})
+            if(NOT "${_ARG}" STREQUAL "")
+                set(${VAR} ${_ARG})
+                break()
+            endif()
+        endforeach()
+    endif()
+    # remove these ctest configuration variables from the defines
+    # passed to the Kokkos configuration
+    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
+        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
+    endif()
+endmacro()
+
+# determine the default working directory
+if(NOT "$ENV{WORKSPACE}" STREQUAL "")
+    set(WORKING_DIR "$ENV{WORKSPACE}")
+else()
+    get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
+endif()
+
+# determine the hostname
+execute_process(COMMAND hostname
+    OUTPUT_VARIABLE HOSTNAME
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}")
+
+# get the number of processors
+include(ProcessorCount)
+ProcessorCount(NUM_PROCESSORS)
+
+# find git
+find_package(Git QUIET)
+if(NOT GIT_EXECUTABLE)
+    unset(GIT_EXECUTABLE CACHE)
+    unset(GIT_EXECUTABLE)
+endif()
+
+function(EXECUTE_GIT_COMMAND VAR)
+    set(${VAR} "" PARENT_SCOPE)
+    execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN}
+        OUTPUT_VARIABLE VAL
+        RESULT_VARIABLE RET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        ERROR_QUIET)
+    string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}")
+    set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE)
+    if(RET EQUAL 0)
+        set(${VAR} "${VAL}" PARENT_SCOPE)
+    endif()
+endfunction()
+
+# just gets the git branch name if available
+function(GET_GIT_BRANCH_NAME VAR)
+    execute_git_command(GIT_BRANCH branch --show-current)
+    set(_INVALID "%D" "HEAD")
+    if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
+        execute_git_command(GIT_BRANCH show -s --format=%D)
+        if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
+            execute_git_command(GIT_BRANCH --describe all)
+        endif()
+    endif()
+    #
+    if(GIT_BRANCH)
+        string(REPLACE " " ";" _DESC "${GIT_BRANCH}")
+        # just set it to last one via loop instead of wonky cmake index manip
+        foreach(_ITR ${_DESC})
+            set(GIT_BRANCH "${_ITR}")
+        endforeach()
+        set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE)
+        message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}")
+    endif()
+endfunction()
+
+# just gets the git branch name if available
+function(GET_GIT_AUTHOR_NAME VAR)
+    execute_git_command(GIT_AUTHOR show -s --format=%an)
+    if(GIT_AUTHOR)
+        string(LENGTH "${GIT_AUTHOR}" STRLEN)
+        # if the build name gets too long, this can cause submission errors
+        if(STRLEN GREATER 24)
+            # remove middle initial
+            string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}")
+            # get first and sur name
+            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}")
+            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}")
+            if(S_NAME)
+                set(GIT_AUTHOR "${S_NAME}")
+            elseif(F_NAME)
+                set(GIT_AUTHOR "${F_NAME}")
+            endif()
+        endif()
+        # remove any spaces, quotes, periods, etc.
+        string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}")
+        set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE)
+        message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}")
+    endif()
+endfunction()
+
+# get the name of the branch
+GET_GIT_BRANCH_NAME(GIT_BRANCH)
+# get the name of the author
+GET_GIT_AUTHOR_NAME(GIT_AUTHOR)
+# author, prefer git method for consistency
+SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR})
+# SLUG == owner_name/repo_name
+SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG})
+# branch name
+SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH})
+# pull request number
+SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM})
+# get the event type, e.g. push, pull_request, api, cron, etc.
+SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE})
+
+if("${BRANCH}" STREQUAL "")
+    message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'")
+    message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=<name>")
+endif()
+
+#----------------------------------------------------------------------------------------#
+#
+#   Set default values if not provided on command-line
+#
+#----------------------------------------------------------------------------------------#
+
+SET_DEFAULT(SOURCE_DIR      "${WORKING_DIR}")           # source directory
+SET_DEFAULT(BINARY_DIR      "${WORKING_DIR}/build")     # build directory
+SET_DEFAULT(BUILD_TYPE      "${CMAKE_BUILD_TYPE}")      # Release, Debug, etc.
+SET_DEFAULT(MODEL           "Continuous")               # Continuous, Nightly, or Experimental
+SET_DEFAULT(JOBS            1)                          # number of parallel ctests
+SET_DEFAULT(CTEST_COMMAND   "${CMAKE_CTEST_COMMAND}")   # just in case
+SET_DEFAULT(CTEST_ARGS      "-V --output-on-failure")   # extra arguments when ctest is called
+SET_DEFAULT(GIT_EXECUTABLE  "git")                      # ctest_update
+SET_DEFAULT(TARGET          "all")                      # build target
+SET_DEFAULT_ARG1(SITE       "$ENV{SITE}"
+                            "${HOSTNAME}")              # update site
+SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}"
+                            "${NUM_PROCESSORS}")        # number of parallel compile jobs
+#
+#   The variable below correspond to ctest arguments, i.e. START,END,STRIDE are
+#   '-I START,END,STRIDE'
+#
+SET_DEFAULT(START           "")
+SET_DEFAULT(END             "")
+SET_DEFAULT(STRIDE          "")
+SET_DEFAULT(INCLUDE         "")
+SET_DEFAULT(EXCLUDE         "")
+SET_DEFAULT(INCLUDE_LABEL   "")
+SET_DEFAULT(EXCLUDE_LABEL   "")
+SET_DEFAULT(PARALLEL_LEVEL  "")
+SET_DEFAULT(STOP_TIME       "")
+SET_DEFAULT(LABELS          "")
+SET_DEFAULT(NOTES           "")
+
+# default static build tag for Nightly
+set(BUILD_TAG "${BRANCH}")
+
+if(NOT BUILD_TYPE)
+    # default for kokkos if not specified
+    set(BUILD_TYPE "RelWithDebInfo")
+endif()
+
+# generate dynamic name if continuous or experimental model
+if(NOT "${MODEL}" STREQUAL "Nightly")
+    if(EVENT_TYPE AND PULL_REQUEST_NUM)
+        # e.g. pull_request/123
+        if(AUTHOR)
+            set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}")
+        else()
+            set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}")
+        endif()
+    elseif(SLUG)
+        # e.g. owner_name/repo_name
+        set(BUILD_TAG "${SLUG}")
+    elseif(AUTHOR)
+        set(BUILD_TAG "${AUTHOR}/${BRANCH}")
+    endif()
+    if(EVENT_TYPE AND NOT PULL_REQUEST_NUM)
+        set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}")
+    endif()
+endif()
+
+# unnecessary
+string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}")
+string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}")
+
+message(STATUS "BUILD_TAG: ${BUILD_TAG}")
+
+set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]")
+
+# colons in build name create extra (empty) entries in CDash
+string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}")
+# unnecessary info
+string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}")
+# consistency
+string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}")
+string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}")
+# miscellaneous from missing fields
+string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}")
+string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}")
+
+# check binary directory
+if(EXISTS ${BINARY_DIR})
+    if(NOT IS_DIRECTORY "${BINARY_DIR}")
+        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!")
+    endif()
+    file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*")
+    if(NOT "${BINARY_DIR_FILES}" STREQUAL "")
+        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not empty!")
+    endif()
+endif()
+
+get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH)
+get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH)
+
+#----------------------------------------------------------------------------------------#
+#
+#   Generate the CTestConfig.cmake
+#
+#----------------------------------------------------------------------------------------#
+
+set(CONFIG_ARGS)
+foreach(_ARG ${KOKKOS_CMAKE_ARGS})
+    if(NOT "${${_ARG}}" STREQUAL "")
+        get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE)
+        if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED")
+            if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF")
+                set(_ARG_TYPE "BOOL")
+            elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}")
+                set(_ARG_TYPE "FILEPATH")
+            elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}")
+                set(_ARG_TYPE "PATH")
+            elseif(NOT "${${_ARG}}" STREQUAL "")
+                set(_ARG_TYPE "STRING")
+            endif()
+        endif()
+        set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n")
+    endif()
+endforeach()
+
+file(WRITE ${BINARY_REALDIR}/initial-cache.cmake
+"
+set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\")
+${CONFIG_ARGS}
+")
+
+file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO)
+message(STATUS "Initial cache:\n${_CACHE_INFO}")
+
+# initialize the cache
+set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake")
+
+
+# generate the CTestConfig.cmake
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in
+    ${BINARY_REALDIR}/CTestConfig.cmake
+    @ONLY)
+
+# copy/generate the dashboard script
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in
+    ${BINARY_REALDIR}/KokkosCTest.cmake
+    @ONLY)
+
+# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake
+execute_process(
+    COMMAND             ${CMAKE_COMMAND} -E touch CTestCustom.cmake
+    WORKING_DIRECTORY   ${BINARY_REALDIR}
+    )
+
+#----------------------------------------------------------------------------------------#
+#
+#   Execute CTest
+#
+#----------------------------------------------------------------------------------------#
+
+message(STATUS "")
+message(STATUS "BUILD_NAME: ${BUILD_NAME}")
+message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...")
+message(STATUS "")
+
+# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV"
+string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}")
+
+execute_process(
+    COMMAND             ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}
+    RESULT_VARIABLE     RET
+    WORKING_DIRECTORY   ${BINARY_REALDIR}
+    )
+
+# ensure that any non-zero result variable gets propagated
+if(NOT RET EQUAL 0)
+    message(FATAL_ERROR "CTest return non-zero exit code: ${RET}")
+endif()
diff --git a/packages/kokkos/cmake/KokkosCTest.cmake.in b/packages/kokkos/cmake/KokkosCTest.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..b6917f3cc1897aa6b1f0876560bb08c0c87b4c3a
--- /dev/null
+++ b/packages/kokkos/cmake/KokkosCTest.cmake.in
@@ -0,0 +1,261 @@
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
+
+if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
+    include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
+endif()
+
+include(ProcessorCount)
+ProcessorCount(CTEST_PROCESSOR_COUNT)
+
+cmake_policy(SET CMP0009 NEW)
+cmake_policy(SET CMP0011 NEW)
+
+# ---------------------------------------------------------------------------- #
+# -- Commands
+# ---------------------------------------------------------------------------- #
+find_program(CTEST_CMAKE_COMMAND    NAMES cmake)
+find_program(CTEST_UNAME_COMMAND    NAMES uname)
+
+find_program(CTEST_BZR_COMMAND      NAMES bzr)
+find_program(CTEST_CVS_COMMAND      NAMES cvs)
+find_program(CTEST_GIT_COMMAND      NAMES git)
+find_program(CTEST_HG_COMMAND       NAMES hg)
+find_program(CTEST_P4_COMMAND       NAMES p4)
+find_program(CTEST_SVN_COMMAND      NAMES svn)
+
+find_program(VALGRIND_COMMAND       NAMES valgrind)
+find_program(GCOV_COMMAND           NAMES gcov)
+find_program(LCOV_COMMAND           NAMES llvm-cov)
+find_program(MEMORYCHECK_COMMAND    NAMES valgrind )
+
+set(MEMORYCHECK_TYPE Valgrind)
+# set(MEMORYCHECK_TYPE Purify)
+# set(MEMORYCHECK_TYPE BoundsChecker)
+# set(MEMORYCHECK_TYPE ThreadSanitizer)
+# set(MEMORYCHECK_TYPE AddressSanitizer)
+# set(MEMORYCHECK_TYPE LeakSanitizer)
+# set(MEMORYCHECK_TYPE MemorySanitizer)
+# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer)
+set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full")
+
+# ---------------------------------------------------------------------------- #
+# -- Settings
+# ---------------------------------------------------------------------------- #
+## -- Process timeout in seconds
+set(CTEST_TIMEOUT           "7200")
+## -- Set output to English
+set(ENV{LC_MESSAGES}        "en_EN" )
+
+
+# ---------------------------------------------------------------------------- #
+# -- Copy ctest configuration file
+# ---------------------------------------------------------------------------- #
+macro(COPY_CTEST_CONFIG_FILES)
+
+    foreach(_FILE CTestConfig.cmake CTestCustom.cmake)
+
+        # if current directory is not binary or source directory
+        if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND
+           NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
+
+            # if file exists in current directory
+            if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE})
+                configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE}
+                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
+            endif()
+
+        # if source and binary differ
+        elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
+
+            # if file exists in source directory but not in binary directory
+            if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND
+               NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE})
+                configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE}
+                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
+            endif()
+
+        endif()
+    endforeach()
+
+endmacro()
+
+ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}")
+
+message(STATUS "CTEST_MODEL: ${CTEST_MODEL}")
+
+#-------------------------------------------------------------------------#
+# Start
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...")
+message(STATUS "")
+
+ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST}
+    ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})
+
+
+#-------------------------------------------------------------------------#
+# Config
+#
+copy_ctest_config_files()
+ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}")
+
+
+#-------------------------------------------------------------------------#
+# Update
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...")
+message(STATUS "")
+
+ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}"
+    RETURN_VALUE up_ret)
+
+
+#-------------------------------------------------------------------------#
+# Configure
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...")
+message(STATUS "")
+
+ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}"
+    SOURCE ${CTEST_SOURCE_DIRECTORY}
+    ${APPEND_CTEST}
+    OPTIONS "${CTEST_CONFIGURE_OPTIONS}"
+    RETURN_VALUE config_ret)
+
+
+#-------------------------------------------------------------------------#
+# Echo configure log bc Damien wants to delay merging this PR for eternity
+#
+file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log")
+# should only have one but loop just for safety
+foreach(_LOG ${_configure_log})
+    file(READ ${_LOG} _LOG_MESSAGE)
+    message(STATUS "Configure Log: ${_LOG}")
+    message(STATUS "\n${_LOG_MESSAGE}\n")
+endforeach()
+
+
+#-------------------------------------------------------------------------#
+# Build
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...")
+message(STATUS "")
+
+ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}"
+    ${APPEND_CTEST}
+    RETURN_VALUE build_ret)
+
+
+#-------------------------------------------------------------------------#
+# Echo build log bc Damien wants to delay merging this PR for eternity
+#
+file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log")
+# should only have one but loop just for safety
+foreach(_LOG ${_build_log})
+    file(READ ${_LOG} _LOG_MESSAGE)
+    message(STATUS "Build Log: ${_LOG}")
+    message(STATUS "\n${_LOG_MESSAGE}\n")
+endforeach()
+
+
+#-------------------------------------------------------------------------#
+# Test
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...")
+message(STATUS "")
+
+ctest_test(RETURN_VALUE test_ret
+    ${APPEND_CTEST}
+    ${START_CTEST}
+    ${END_CTEST}
+    ${STRIDE_CTEST}
+    ${INCLUDE_CTEST}
+    ${EXCLUDE_CTEST}
+    ${INCLUDE_LABEL_CTEST}
+    ${EXCLUDE_LABEL_CTEST}
+    ${PARALLEL_LEVEL_CTEST}
+    ${STOP_TIME_CTEST}
+    SCHEDULE_RANDOM OFF)
+
+
+#-------------------------------------------------------------------------#
+# Coverage
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...")
+message(STATUS "")
+
+execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS}
+    WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY}
+    ERROR_QUIET)
+
+ctest_coverage(${APPEND_CTEST}
+    ${CTEST_COVERAGE_LABELS}
+    RETURN_VALUE cov_ret)
+
+
+#-------------------------------------------------------------------------#
+# MemCheck
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...")
+message(STATUS "")
+
+ctest_memcheck(RETURN_VALUE mem_ret
+    ${APPEND_CTEST}
+    ${START_CTEST}
+    ${END_CTEST}
+    ${STRIDE_CTEST}
+    ${INCLUDE_CTEST}
+    ${EXCLUDE_CTEST}
+    ${INCLUDE_LABEL_CTEST}
+    ${EXCLUDE_LABEL_CTEST}
+    ${PARALLEL_LEVEL_CTEST})
+
+
+#-------------------------------------------------------------------------#
+# Submit
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...")
+message(STATUS "")
+
+file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake")
+foreach(_FILE ${NOTE_FILES})
+    message(STATUS "Including CTest notes files: \"${_FILE}\"...")
+    include("${_FILE}")
+endforeach()
+
+# capture submit error so it doesn't fail because of a submission error
+ctest_submit(RETURN_VALUE submit_ret
+    RETRY_COUNT 2
+    RETRY_DELAY 10
+    CAPTURE_CMAKE_ERROR submit_err)
+
+#-------------------------------------------------------------------------#
+# Submit
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})")
+message(STATUS "")
+
+
+#-------------------------------------------------------------------------#
+# Non-zero exit codes for important errors
+#
+if(NOT config_ret EQUAL 0)
+    message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}")
+endif()
+
+if(NOT build_ret EQUAL 0)
+    message(FATAL_ERROR "Error during build! Exit code: ${build_ret}")
+endif()
+
+if(NOT test_ret EQUAL 0)
+    message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}")
+endif()
diff --git a/packages/kokkos/cmake/KokkosConfig.cmake.in b/packages/kokkos/cmake/KokkosConfig.cmake.in
index 9fbd22ee5c47899f5b625b852dd2858c894e2053..44a8fcd9c319326399ab19146f8cf213dbb51f64 100644
--- a/packages/kokkos/cmake/KokkosConfig.cmake.in
+++ b/packages/kokkos/cmake/KokkosConfig.cmake.in
@@ -19,17 +19,44 @@ INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake")
 INCLUDE("${Kokkos_CMAKE_DIR}/KokkosConfigCommon.cmake")
 UNSET(Kokkos_CMAKE_DIR)
 
-# if CUDA was enabled and separable compilation was specified, e.g.
-#   find_package(Kokkos COMPONENTS separable_compilation)
-# then we set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK
-IF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS)
+# check for conflicts
+IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS AND
+    "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS)
+    MESSAGE(STATUS "'launch_compiler' implies global redirection of targets depending on Kokkos to appropriate compiler.")
+    MESSAGE(STATUS "'separable_compilation' implies explicitly defining where redirection occurs via 'kokkos_compilation(PROJECT|TARGET|SOURCE|DIRECTORY ...)'")
+    MESSAGE(FATAL_ERROR "Conflicting COMPONENTS: 'launch_compiler' and 'separable_compilation'")
+ENDIF()
+
+IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS)
+    #
+    # if find_package(Kokkos COMPONENTS launch_compiler) then rely on the
+    # RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK to always redirect to the
+    # appropriate compiler for Kokkos
+    #
+
+    MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos")
+    kokkos_compilation(
+        GLOBAL
+        CHECK_CUDA_COMPILES)
+
+ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS)
+    #
+    # if CUDA was enabled, separable compilation was not specified, and current compiler
+    # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and
+    # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation.
+    # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present,
+    # otherwise, the original command will be executed
+    #
+
     # run test to see if CMAKE_CXX_COMPILER=nvcc_wrapper
     kokkos_compiler_is_nvcc(IS_NVCC ${CMAKE_CXX_COMPILER})
-    # if not nvcc_wrapper, use RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK
-    IF(NOT IS_NVCC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL Clang AND
-        (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER))
-        MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to nvcc_wrapper")
+
+    # if not nvcc_wrapper and Kokkos_LAUNCH_COMPILER was not set to OFF
+    IF(NOT IS_NVCC AND (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER))
+        MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos")
         kokkos_compilation(GLOBAL)
     ENDIF()
-    UNSET(IS_NVCC) # be mindful of the environment, pollution is bad
+
+    # be mindful of the environment, pollution is bad
+    UNSET(IS_NVCC)
 ENDIF()
diff --git a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
index 42c755c2157f67baa3c88af05172ac450651f7e2..ab93e65afe97ab9be9295312e6cd879a1aff6b27 100644
--- a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
+++ b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
@@ -3,6 +3,7 @@ SET(Kokkos_OPTIONS @KOKKOS_ENABLED_OPTIONS@)
 SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@)
 SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@)
 SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@")
+SET(Kokkos_CXX_COMPILER_ID "@KOKKOS_CXX_COMPILER_ID@")
 
 # These are needed by KokkosKernels
 FOREACH(DEV ${Kokkos_DEVICES})
@@ -13,13 +14,13 @@ IF(NOT Kokkos_FIND_QUIETLY)
   MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}")
 ENDIF()
 
-IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0")
-  #If we are building CUDA, we have tricked CMake because we declare a CXX project
-  #If the default C++ standard for a given compiler matches the requested
-  #standard, then CMake just omits the -std flag in later versions of CMake
-  #This breaks CUDA compilation (CUDA compiler can have a different default
-  #-std then the underlying host compiler by itself). Setting this variable
-  #forces CMake to always add the -std flag even if it thinks it doesn't need it
+IF (Kokkos_ENABLE_CUDA)
+  # If we are building CUDA, we have tricked CMake because we declare a CXX project
+  # If the default C++ standard for a given compiler matches the requested
+  # standard, then CMake just omits the -std flag in later versions of CMake
+  # This breaks CUDA compilation (CUDA compiler can have a different default
+  # -std then the underlying host compiler by itself). Setting this variable
+  # forces CMake to always add the -std flag even if it thinks it doesn't need it
   SET(CMAKE_CXX_STANDARD_DEFAULT 98 CACHE INTERNAL "" FORCE)
 ENDIF()
 
@@ -90,7 +91,88 @@ function(kokkos_check)
   endif()
 endfunction()
 
-# this function is provided to easily select which files use nvcc_wrapper:
+# A test to check whether a downstream project set the C++ compiler to NVCC or not
+# this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON
+FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER)
+    # Check if the compiler is nvcc (which really means nvcc_wrapper).
+    EXECUTE_PROCESS(COMMAND ${COMPILER} ${ARGN} --version
+                    OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION
+                    OUTPUT_STRIP_TRAILING_WHITESPACE
+                    RESULT_VARIABLE RET)
+    # something went wrong
+    IF(RET GREATER 0)
+        SET(${VAR} false PARENT_SCOPE)
+    ELSE()
+        STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} )
+        STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC)
+        STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}")
+        IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1)
+            SET(${VAR} true PARENT_SCOPE)
+        ELSE()
+            SET(${VAR} false PARENT_SCOPE)
+        ENDIF()
+    ENDIF()
+ENDFUNCTION()
+
+# this function checks whether the current CXX compiler supports building CUDA
+FUNCTION(kokkos_cxx_compiler_cuda_test _VAR _COMPILER)
+
+    FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu
+"
+#include <cuda.h>
+#include <cstdlib>
+
+__global__
+void kernel(int sz, double* data)
+{
+    int _beg = blockIdx.x * blockDim.x + threadIdx.x;
+    for(int i = _beg; i < sz; ++i)
+        data[i] += static_cast<double>(i);
+}
+
+int main()
+{
+    double* data = NULL;
+    int blocks = 64;
+    int grids = 64;
+    int ret = cudaMalloc(&data, blocks * grids * sizeof(double));
+    if(ret != cudaSuccess)
+        return EXIT_FAILURE;
+    kernel<<<grids, blocks>>>(blocks * grids, data);
+    cudaDeviceSynchronize();
+    return EXIT_SUCCESS;
+}
+")
+
+    # save the command for debugging
+    SET(_COMMANDS "${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu")
+
+    # use execute_process instead of try compile because we want to set custom compiler
+    EXECUTE_PROCESS(COMMAND ${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu
+        RESULT_VARIABLE     _RET
+        WORKING_DIRECTORY   ${PROJECT_BINARY_DIR}/compile_tests
+        TIMEOUT             15
+        OUTPUT_QUIET
+        ERROR_QUIET)
+
+    IF(NOT _RET EQUAL 0)
+        # save the command for debugging
+        SET(_COMMANDS "${_COMMAND}\n${_COMPILER} --cuda-gpu-arch=sm_35 ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu")
+        # try the compile test again with clang arguments
+        EXECUTE_PROCESS(COMMAND ${_COMPILER} --cuda-gpu-arch=sm_35 -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu
+            RESULT_VARIABLE     _RET
+            WORKING_DIRECTORY   ${PROJECT_BINARY_DIR}/compile_tests
+            TIMEOUT             15
+            OUTPUT_QUIET
+            ERROR_QUIET)
+    ENDIF()
+
+    SET(${_VAR}_COMMANDS "${_COMMANDS}" PARENT_SCOPE)
+    SET(${_VAR} ${_RET} PARENT_SCOPE)
+ENDFUNCTION()
+
+# this function is provided to easily select which files use the same compiler as Kokkos
+# when it was installed (or nvcc_wrapper):
 #
 #       GLOBAL      --> all files
 #       TARGET      --> all files in a target
@@ -98,8 +180,21 @@ endfunction()
 #       DIRECTORY   --> all files in directory
 #       PROJECT     --> all files/targets in a project/subproject
 #
+# Use the COMPILER argument to specify a compiler, if needed. By default, it will
+# set the values to ${Kokkos_CXX_COMPILER} unless Kokkos_ENABLE_CUDA=ON and
+# Kokkos_CXX_COMPILER_ID is NVIDIA, then it will set it to nvcc_wrapper
+#
+# Use CHECK_CUDA_COMPILES to run a check when CUDA is enabled
+#
 FUNCTION(kokkos_compilation)
-    CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN})
+    CMAKE_PARSE_ARGUMENTS(COMP
+        "GLOBAL;PROJECT;CHECK_CUDA_COMPILES"
+        "COMPILER"
+        "DIRECTORY;TARGET;SOURCE;COMMAND_PREFIX"
+        ${ARGN})
+
+    # if built w/o CUDA support, we want to basically make this a no-op
+    SET(_Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@)
 
     # search relative first and then absolute
     SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@")
@@ -115,10 +210,52 @@ FUNCTION(kokkos_compilation)
         MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'")
     ENDIF()
 
+    # if COMPILER was not specified, assume Kokkos_CXX_COMPILER
+    IF(NOT COMP_COMPILER)
+        SET(COMP_COMPILER ${Kokkos_CXX_COMPILER})
+        IF(_Kokkos_ENABLE_CUDA AND Kokkos_CXX_COMPILER_ID STREQUAL NVIDIA)
+            # find nvcc_wrapper
+            FIND_PROGRAM(Kokkos_NVCC_WRAPPER
+                NAMES           nvcc_wrapper
+                HINTS           ${_HINTS}
+                PATHS           ${_HINTS}
+                PATH_SUFFIXES   bin)
+            # fatal if we can't nvcc_wrapper
+            IF(NOT Kokkos_NVCC_WRAPPER)
+                MESSAGE(FATAL_ERROR "Kokkos could not find nvcc_wrapper. Please set '-DKokkos_NVCC_WRAPPER=/path/to/nvcc_wrapper'")
+            ENDIF()
+            SET(COMP_COMPILER ${Kokkos_NVCC_WRAPPER})
+        ENDIF()
+    ENDIF()
+
+    # check that the original compiler still exists!
+    IF(NOT EXISTS ${COMP_COMPILER})
+        MESSAGE(FATAL_ERROR "Kokkos could not find original compiler: '${COMP_COMPILER}'")
+    ENDIF()
+
+    # try to ensure that compiling cuda code works!
+    IF(_Kokkos_ENABLE_CUDA AND COMP_CHECK_CUDA_COMPILES)
+
+        # this may fail if kokkos_compiler launcher was used during install
+        kokkos_cxx_compiler_cuda_test(_COMPILES_CUDA
+            ${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER})
+
+        # if above failed, throw an error
+        IF(NOT _COMPILES_CUDA)
+            MESSAGE(FATAL_ERROR "kokkos_cxx_compiler_cuda_test failed! Test commands:\n${_COMPILES_CUDA_COMMANDS}")
+        ENDIF()
+    ENDIF()
+
+    IF(COMP_COMMAND_PREFIX)
+        SET(_PREFIX "${COMP_COMMAND_PREFIX}")
+        STRING(REPLACE ";" " " _PREFIX "${COMP_COMMAND_PREFIX}")
+        SET(Kokkos_COMPILER_LAUNCHER "${_PREFIX} ${Kokkos_COMPILE_LAUNCHER}")
+    ENDIF()
+
     IF(COMP_GLOBAL)
         # if global, don't bother setting others
-        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
-        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
+        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}")
+        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}")
     ELSE()
         FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE)
             # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...)
@@ -128,34 +265,10 @@ FUNCTION(kokkos_compilation)
             ENDIF()
             # set the properties if defined
             IF(COMP_${_TYPE})
-                # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}")
-                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
-                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
+                # MESSAGE(STATUS "Using ${COMP_COMPILER} :: ${_TYPE} :: ${COMP_${_TYPE}}")
+                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}")
+                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}")
             ENDIF()
         ENDFOREACH()
     ENDIF()
 ENDFUNCTION()
-
-# A test to check whether a downstream project set the C++ compiler to NVCC or not
-# this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON
-FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER)
-    # Check if the compiler is nvcc (which really means nvcc_wrapper).
-    EXECUTE_PROCESS(COMMAND ${COMPILER} ${ARGN} --version
-                    OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION
-                    OUTPUT_STRIP_TRAILING_WHITESPACE
-                    RESULT_VARIABLE RET)
-    # something went wrong
-    IF(RET GREATER 0)
-        SET(${VAR} false PARENT_SCOPE)
-    ELSE()
-        STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} )
-        STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC)
-        STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}")
-        IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1)
-            SET(${VAR} true PARENT_SCOPE)
-        ELSE()
-            SET(${VAR} false PARENT_SCOPE)
-        ENDIF()
-    ENDIF()
-ENDFUNCTION()
-
diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in
index 0259fe69d50c3f47fa090b9b221df8253b425c5c..fbfae3711ec14573b4c3067aea4a8625d6b2ad8c 100644
--- a/packages/kokkos/cmake/KokkosCore_config.h.in
+++ b/packages/kokkos/cmake/KokkosCore_config.h.in
@@ -78,6 +78,7 @@
 #cmakedefine KOKKOS_ARCH_POWER7
 #cmakedefine KOKKOS_ARCH_POWER8
 #cmakedefine KOKKOS_ARCH_POWER9
+#cmakedefine KOKKOS_ARCH_INTEL_GEN
 #cmakedefine KOKKOS_ARCH_KEPLER
 #cmakedefine KOKKOS_ARCH_KEPLER30
 #cmakedefine KOKKOS_ARCH_KEPLER32
@@ -95,5 +96,8 @@
 #cmakedefine KOKKOS_ARCH_VOLTA72
 #cmakedefine KOKKOS_ARCH_TURING75
 #cmakedefine KOKKOS_ARCH_AMPERE80
+#cmakedefine KOKKOS_ARCH_AMPERE86
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
 #cmakedefine KOKKOS_ARCH_AMD_ZEN2
+
+#cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF
diff --git a/packages/kokkos/cmake/Modules/CudaToolkit.cmake b/packages/kokkos/cmake/Modules/CudaToolkit.cmake
index d620a71d369888fd5adecabde14119fbff63d6c0..eda5541f7c0633a868285190e9a4c39c275adf6b 100644
--- a/packages/kokkos/cmake/Modules/CudaToolkit.cmake
+++ b/packages/kokkos/cmake/Modules/CudaToolkit.cmake
@@ -481,76 +481,6 @@ if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILE
   unset(cuda_dir)
 endif()
 
-IF(CMAKE_VERSION VERSION_LESS "3.12.0")
-  function(import_target_link_libraries target)
-    cmake_parse_arguments(HACK
-      "SYSTEM;INTERFACE;PUBLIC"
-      ""
-      ""
-      ${ARGN}
-    )
-    get_target_property(LIBS ${target} INTERFACE_LINK_LIBRARIES)
-    if (LIBS)
-      list(APPEND LIBS ${HACK_UNPARSED_ARGUMENTS})
-    else()
-      set(LIBS ${HACK_UNPARSED_ARGUMENTS})
-    endif()
-    set_target_properties(${target} PROPERTIES
-      INTERFACE_LINK_LIBRARIES "${LIBS}")
-  endfunction()
-ELSE()
-  function(import_target_link_libraries)
-    target_link_libraries(${ARGN})
-  endfunction()
-ENDIF()
-
-IF(CMAKE_VERSION VERSION_LESS "3.13.0")
-  function(import_target_link_directories target)
-    cmake_parse_arguments(HACK
-      "SYSTEM;INTERFACE;PUBLIC"
-      ""
-      ""
-      ${ARGN}
-    )
-    get_target_property(LINK_LIBS ${target} INTERFACE_LINK_LIBRARIES)
-    if (LINK_LIBS) #could be not-found
-      set(LINK_LIBS_LIST ${LINK_LIBS})
-    endif()
-    foreach(LIB ${HACK_UNPARSED_ARGUMENTS})
-      list(APPEND LINK_LIBS_LIST -L${LIB})
-    endforeach()
-    set_target_properties(${target} PROPERTIES
-      INTERFACE_LINK_LIBRARIES "${LINK_LIBS_LIST}")
-  endfunction()
-ELSE()
-  function(import_target_link_directories)
-    target_link_directories(${ARGN})
-  endfunction()
-ENDIF()
-
-IF(CMAKE_VERSION VERSION_LESS "3.12.0")
-  function(import_target_include_directories target)
-    cmake_parse_arguments(HACK
-      "SYSTEM;INTERFACE;PUBLIC"
-      ""
-      ""
-      ${ARGN}
-    )
-    get_target_property(INLUDE_DIRS ${target} INTERFACE_INCLUDE_DIRECTORIES)
-    if (INCLUDE_DIRS)
-      list(APPEND INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS})
-    else()
-      set(INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS})
-    endif()
-    set_target_properties(${target} PROPERTIES
-      INTERFACE_INCLUDE_DIRECTORIES "${INCLUDE_DIRS}")
-  endfunction()
-ELSE()
-  function(import_target_include_directories)
-    target_include_directories(${ARGN})
-  endfunction()
-ENDIF()
-
 # Try language- or user-provided path first.
 if(CUDAToolkit_BIN_DIR)
   find_program(CUDAToolkit_NVCC_EXECUTABLE
@@ -854,11 +784,11 @@ if(CUDAToolkit_FOUND)
 
     if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY)
       add_library(CUDA::${lib_name} IMPORTED INTERFACE)
-      import_target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
-      import_target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}")
+      target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
+      target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}")
       foreach(dep ${arg_DEPS})
         if(TARGET CUDA::${dep})
-          import_target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep})
+          target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep})
         endif()
       endforeach()
     endif()
@@ -866,8 +796,8 @@ if(CUDAToolkit_FOUND)
 
   if(NOT TARGET CUDA::toolkit)
     add_library(CUDA::toolkit IMPORTED INTERFACE)
-    import_target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
-    import_target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}")
+    target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
+    target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}")
   endif()
 
   _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda)
@@ -882,11 +812,11 @@ if(CUDAToolkit_FOUND)
      AND TARGET CUDA::cudart_static)
 
     add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
-    import_target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps)
+    target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps)
 
     if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER))
       find_package(Threads REQUIRED)
-      import_target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS})
+      target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS})
     endif()
 
     if(UNIX AND NOT APPLE)
@@ -896,7 +826,7 @@ if(CUDAToolkit_FOUND)
       if(NOT CUDAToolkit_rt_LIBRARY)
         message(WARNING "Could not find librt library, needed by CUDA::cudart_static")
       else()
-        import_target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY})
+        target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY})
       endif()
     endif()
   endif()
diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
index a1072a60c6182413768941b6f9d4537d7df74f61..8d58d96415808499dc39d44ad3600f5f5a64368e 100644
--- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
@@ -25,7 +25,7 @@ IF (TARGET CUDA::cuda_driver)
   SET(FOUND_CUDA_DRIVER TRUE)
   KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver)
 ELSE()
-  SET(FOUND_CUDA_DRIVVER FALSE)
+  SET(FOUND_CUDA_DRIVER FALSE)
 ENDIF()
 
 include(FindPackageHandleStandardArgs)
diff --git a/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake b/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake
index 1d154e29afff16479663d9c8d495f81142e5cf82..a743fca0e45290cf7ad80e3b022e7f66a34947fa 100644
--- a/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake
@@ -10,7 +10,7 @@ TRY_COMPILE(KOKKOS_HAS_PTHREAD_ARG
 # ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION}
 
 INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(PTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLPTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG)
 #Only create the TPL if we succeed
 IF (KOKKOS_HAS_PTHREAD_ARG)
   KOKKOS_CREATE_IMPORTED_TPL(PTHREAD
diff --git a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..512ad6ceb283dcd27f8db1dfb45f045f998d7875
--- /dev/null
+++ b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake
@@ -0,0 +1,11 @@
+include(FindPackageHandleStandardArgs)
+
+FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib)
+FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib)
+
+find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY)
+
+kokkos_create_imported_tpl(ROCM INTERFACE
+  LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY}
+  COMPILE_DEFINITIONS __HIP_ROCclr__
+)
diff --git a/packages/kokkos/cmake/compile_tests/cplusplus14.cpp b/packages/kokkos/cmake/compile_tests/cplusplus14.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..52ec9885ec3ed5f4e7c0871f59de3d651df33efe
--- /dev/null
+++ b/packages/kokkos/cmake/compile_tests/cplusplus14.cpp
@@ -0,0 +1,8 @@
+#include <type_traits>
+
+int main() {
+  // _t versions of type traits were added in C++14
+  std::remove_cv_t<int> i = 0;
+
+  return i;
+}
diff --git a/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index 48c01c070cb8d1db5542a4da4e4d3fbd51e008be..a26ac5af4bf2dee2c26f1ee20c6c500fe465bf9f 100644
--- a/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -72,6 +72,7 @@ int main() {
     case 72: std::cout << "Set -DKokkos_ARCH_VOLTA72=ON ." << std::endl; break;
     case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
     case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
+    case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
     default:
       std::cout << "Compute capability " << compute_capability
                 << " is not supported" << std::endl;
diff --git a/packages/kokkos/cmake/compile_tests/pthread.cpp b/packages/kokkos/cmake/compile_tests/pthread.cpp
index 92310da0293704a121e265766dbe2979fc66513e..3f83bf6a5f7fe399fc4a44547792e738177facfb 100644
--- a/packages/kokkos/cmake/compile_tests/pthread.cpp
+++ b/packages/kokkos/cmake/compile_tests/pthread.cpp
@@ -2,7 +2,7 @@
 
 void* kokkos_test(void* args) { return args; }
 
-int main(void) {
+int main() {
   pthread_t thread;
   /* Use NULL to avoid C++11. Some compilers
      do not have C++11 by default.  Forcing C++11
diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake
index 2e82a462356b5520b1f3edcfec1635fc0f6f99cc..fbd6745a602caa8976958d10cf7d9b4c1fa3c471 100644
--- a/packages/kokkos/cmake/fake_tribits.cmake
+++ b/packages/kokkos/cmake/fake_tribits.cmake
@@ -81,10 +81,16 @@ ENDMACRO()
 FUNCTION(KOKKOS_ADD_TEST)
   if (KOKKOS_HAS_TRILINOS)
     CMAKE_PARSE_ARGUMENTS(TEST
-      ""
+      "SKIP_TRIBITS"
       "EXE;NAME;TOOL"
       "ARGS"
       ${ARGN})
+
+    IF(TEST_SKIP_TRIBITS)
+      MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits")
+      RETURN()
+    ENDIF()
+
     IF(TEST_EXE)
       SET(EXE_ROOT ${TEST_EXE})
     ELSE()
@@ -119,11 +125,10 @@ FUNCTION(KOKKOS_ADD_TEST)
     endif()
   else()
     CMAKE_PARSE_ARGUMENTS(TEST
-      "WILL_FAIL"
+      "WILL_FAIL;SKIP_TRIBITS"
       "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL"
       "CATEGORIES;ARGS"
       ${ARGN})
-    SET(TESTS_ADDED)
     # To match Tribits, we should always be receiving
     # the root names of exes/libs
     IF(TEST_EXE)
@@ -135,48 +140,27 @@ FUNCTION(KOKKOS_ADD_TEST)
     # These should be the full target name
     SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME})
     SET(EXE ${PACKAGE_NAME}_${EXE_ROOT})
-    IF (TEST_ARGS)
-      SET(TEST_NUMBER 0)
-      FOREACH (ARG_STR ${TEST_ARGS})
-        # This is passed as a single string blob to match TriBITS behavior
-        # We need this to be turned into a list
-        STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR})
-        IF(WIN32)
-          ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH}
-                   COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${ARG_STR_LIST})
-        ELSE()
-          ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} COMMAND ${EXE} ${ARG_STR_LIST})
-        ENDIF()
-        LIST(APPEND TESTS_ADDED "${TEST_NAME}${TEST_NUMBER}")
-        MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1")
-      ENDFOREACH()
+    IF(WIN32)
+      ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH}
+        COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS})
     ELSE()
-      IF(WIN32)
-        ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH}
-                 COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX})
-      ELSE()
-        ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE})
-      ENDIF()
-      LIST(APPEND TESTS_ADDED "${TEST_NAME}")
+      ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS})
+    ENDIF()
+    IF(TEST_WILL_FAIL)
+      SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL})
+    ENDIF()
+    IF(TEST_FAIL_REGULAR_EXPRESSION)
+      SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION})
+    ENDIF()
+    IF(TEST_PASS_REGULAR_EXPRESSION)
+      SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION})
+    ENDIF()
+    IF(TEST_TOOL)
+      ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool
+      SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>")
     ENDIF()
-
-    FOREACH(TEST_NAME ${TESTS_ADDED})
-      IF(TEST_WILL_FAIL)
-        SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL})
-      ENDIF()
-      IF(TEST_FAIL_REGULAR_EXPRESSION)
-        SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION})
-      ENDIF()
-      IF(TEST_PASS_REGULAR_EXPRESSION)
-        SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION})
-      ENDIF()
-      if(TEST_TOOL)
-        add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool
-        set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>")
-      endif()
-    ENDFOREACH()
     VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS})
-  endif()
+  ENDIF()
 ENDFUNCTION()
 
 FUNCTION(KOKKOS_ADD_ADVANCED_TEST)
@@ -326,14 +310,6 @@ ENDIF()
 ENDFUNCTION()
 
 
-FUNCTION(KOKKOS_TARGET_COMPILE_DEFINITIONS)
-  IF (KOKKOS_HAS_TRILINOS)
-    TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN})
-  ELSE()
-    TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN})
-  ENDIF()
-ENDFUNCTION()
-
 FUNCTION(KOKKOS_INCLUDE_DIRECTORIES)
 IF(KOKKOS_HAS_TRILINOS)
   TRIBITS_INCLUDE_DIRECTORIES(${ARGN})
@@ -350,10 +326,6 @@ ENDIF()
 ENDFUNCTION()
 
 
-MACRO(KOKKOS_ADD_COMPILE_OPTIONS)
-ADD_COMPILE_OPTIONS(${ARGN})
-ENDMACRO()
-
 MACRO(PRINTALL match)
 get_cmake_property(_variableNames VARIABLES)
 list (SORT _variableNames)
@@ -376,4 +348,3 @@ FUNCTION(GLOBAL_APPEND VARNAME)
   LIST(APPEND TEMP ${ARGN})
   GLOBAL_SET(${VARNAME} ${TEMP})
 ENDFUNCTION()
-
diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake
index 53aaf7dccf169a4bfd0dff0e93bf619a4f1f8bee..ec18e70a36a34dbecc305f978e0d7b84c482da37 100644
--- a/packages/kokkos/cmake/kokkos_arch.cmake
+++ b/packages/kokkos/cmake/kokkos_arch.cmake
@@ -35,7 +35,7 @@ KOKKOS_ARCH_OPTION(ARMV80          HOST "ARMv8.0 Compatible CPU")
 KOKKOS_ARCH_OPTION(ARMV81          HOST "ARMv8.1 Compatible CPU")
 KOKKOS_ARCH_OPTION(ARMV8_THUNDERX  HOST "ARMv8 Cavium ThunderX CPU")
 KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU")
-KOKKOS_ARCH_OPTION(A64FX           HOST "ARMv8.2 with SVE Suport")
+KOKKOS_ARCH_OPTION(A64FX           HOST "ARMv8.2 with SVE Support")
 KOKKOS_ARCH_OPTION(WSM             HOST "Intel Westmere CPU")
 KOKKOS_ARCH_OPTION(SNB             HOST "Intel Sandy/Ivy Bridge CPUs")
 KOKKOS_ARCH_OPTION(HSW             HOST "Intel Haswell CPUs")
@@ -60,11 +60,12 @@ KOKKOS_ARCH_OPTION(VOLTA70         GPU  "NVIDIA Volta generation CC 7.0")
 KOKKOS_ARCH_OPTION(VOLTA72         GPU  "NVIDIA Volta generation CC 7.2")
 KOKKOS_ARCH_OPTION(TURING75        GPU  "NVIDIA Turing generation CC 7.5")
 KOKKOS_ARCH_OPTION(AMPERE80        GPU  "NVIDIA Ampere generation CC 8.0")
+KOKKOS_ARCH_OPTION(AMPERE86        GPU  "NVIDIA Ampere generation CC 8.6")
 KOKKOS_ARCH_OPTION(ZEN             HOST "AMD Zen architecture")
 KOKKOS_ARCH_OPTION(ZEN2            HOST "AMD Zen2 architecture")
 KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900")
 KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906")
-KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU")
+KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU MI100 GFX908")
 KOKKOS_ARCH_OPTION(INTEL_GEN       GPU  "Intel GPUs Gen9+")
 
 
@@ -141,8 +142,16 @@ ENDIF()
 #------------------------------- KOKKOS_HIP_OPTIONS ---------------------------
 #clear anything that might be in the cache
 GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP)
-  SET(AMDGPU_ARCH_FLAG "--amdgpu-target")
+IF(KOKKOS_ENABLE_HIP)
+  IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
+    SET(AMDGPU_ARCH_FLAG "--amdgpu-target")
+  ELSE()
+    SET(AMDGPU_ARCH_FLAG "--offload-arch")
+    GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip)
+    IF(DEFINED ENV{ROCM_PATH})
+      GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH})
+    ENDIF()
+  ENDIF()
 ENDIF()
 
 
@@ -183,6 +192,8 @@ ENDIF()
 IF (KOKKOS_ARCH_A64FX)
   COMPILER_SPECIFIC_FLAGS(
     DEFAULT -march=armv8.2-a+sve
+    Clang -march=armv8.2-a+sve -msve-vector-bits=512
+    GCC -march=armv8.2-a+sve -msve-vector-bits=512
   )
 ENDIF()
 
@@ -309,7 +320,7 @@ IF (KOKKOS_ARCH_POWER8 OR KOKKOS_ARCH_POWER9)
   SET(KOKKOS_USE_ISA_POWERPCLE ON)
 ENDIF()
 
-IF (Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
+IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
   COMPILER_SPECIFIC_FLAGS(
     Clang  -fcuda-rdc
     NVIDIA --relocatable-device-code=true
@@ -333,8 +344,8 @@ ENDIF()
 
 #Right now we cannot get the compiler ID when cross-compiling, so just check
 #that HIP is enabled
-IF (Kokkos_ENABLE_HIP)
-  IF (Kokkos_ENABLE_HIP_RELOCATABLE_DEVICE_CODE)
+IF (KOKKOS_ENABLE_HIP)
+  IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE)
     COMPILER_SPECIFIC_FLAGS(
       DEFAULT -fgpu-rdc
     )
@@ -345,8 +356,7 @@ IF (Kokkos_ENABLE_HIP)
   ENDIF()
 ENDIF()
 
-
-IF (Kokkos_ENABLE_SYCL)
+IF (KOKKOS_ENABLE_SYCL)
   COMPILER_SPECIFIC_FLAGS(
     DEFAULT -fsycl
   )
@@ -363,7 +373,7 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
       MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.")
     ENDIF()
     SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE)
-    IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET)
+    IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL)
       MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
       UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE)
     ELSE()
@@ -396,6 +406,7 @@ CHECK_CUDA_ARCH(VOLTA70   sm_70)
 CHECK_CUDA_ARCH(VOLTA72   sm_72)
 CHECK_CUDA_ARCH(TURING75  sm_75)
 CHECK_CUDA_ARCH(AMPERE80  sm_80)
+CHECK_CUDA_ARCH(AMPERE86  sm_86)
 
 SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
 FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -405,12 +416,12 @@ FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
     ENDIF()
     SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE)
     IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET)
-      MESSAGE(WARNING "Given HIP arch ${ARCH}, but Kokkos_ENABLE_AMDGPU and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
+      MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
       UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE)
     ELSE()
       SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE)
       GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}")
-      IF(KOKKOS_ENABLE_HIP)
+      IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE)
         GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}")
       ENDIF()
     ENDIF()
@@ -451,6 +462,24 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
   ENDIF()
 ENDIF()
 
+IF (KOKKOS_ENABLE_SYCL)
+  IF(CUDA_ARCH_ALREADY_SPECIFIED)
+    IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS)
+      COMPILER_SPECIFIC_FLAGS(
+        DEFAULT -fsycl-targets=nvptx64-nvidia-cuda-sycldevice
+      )
+      # FIXME_SYCL The CUDA backend doesn't support printf yet.
+      GLOBAL_SET(KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF ON)
+    ELSE()
+      MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!")
+    ENDIF()
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl"
+    )
+  ENDIF()
+ENDIF()
+
 IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED)
   # Try to autodetect the CUDA Compute Capability by asking the device
   SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir)
@@ -464,6 +493,43 @@ IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED)
     ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc
     COMPILE_DEFINITIONS -DSM_ONLY
     RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY)
+
+  # if user is using kokkos_compiler_launcher, above will fail.
+  IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0)
+    # check to see if CUDA is not already enabled (may happen when Kokkos is subproject)
+    GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
+    # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough
+    IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES)
+      # make sure the user knows that we aren't using CUDA compiler for anything else
+      MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...")
+      INCLUDE(CheckLanguage)
+      CHECK_LANGUAGE(CUDA)
+      IF(CMAKE_CUDA_COMPILER)
+        ENABLE_LANGUAGE(CUDA)
+      ELSE()
+        MESSAGE(STATUS "CUDA language could not be enabled")
+      ENDIF()
+    ENDIF()
+
+    # if CUDA was enabled, this will be defined
+    IF(CMAKE_CUDA_COMPILER)
+      # copy our test to .cu so cmake compiles as CUDA
+      CONFIGURE_FILE(
+        ${PROJECT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc
+        ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu
+        COPYONLY
+      )
+      # run test again
+      TRY_RUN(
+        _RESULT
+        _COMPILE_RESULT
+        ${_BINARY_TEST_DIR}
+        ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu
+        COMPILE_DEFINITIONS -DSM_ONLY
+        RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY)
+    ENDIF()
+  ENDIF()
+
   LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX)
   IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1)
     MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}")
@@ -500,7 +566,7 @@ IF (KOKKOS_ENABLE_CUDA)
     SET(KOKKOS_ARCH_VOLTA ON)
   ENDIF()
 
-  IF (KOKKOS_ARCH_AMPERE80)
+  IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
     SET(KOKKOS_ARCH_AMPERE ON)
   ENDIF()
 ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake
index e6600161f9fe1b205fe4b481bc1af4d91a00c3e1..4434d6928f46429ad7525c944a0c1c6c351c4cdd 100644
--- a/packages/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake
@@ -27,6 +27,12 @@ IF(Kokkos_ENABLE_CUDA)
       PATHS           ${PROJECT_SOURCE_DIR}
       PATH_SUFFIXES   bin)
 
+  FIND_PROGRAM(Kokkos_NVCC_WRAPPER
+      NAMES           nvcc_wrapper
+      HINTS           ${PROJECT_SOURCE_DIR}
+      PATHS           ${PROJECT_SOURCE_DIR}
+      PATH_SUFFIXES   bin)
+
   # check if compiler was set to nvcc_wrapper
   kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER})
   # if launcher was found and nvcc_wrapper was not specified as
@@ -37,7 +43,7 @@ IF(Kokkos_ENABLE_CUDA)
     # if the second argument matches the C++ compiler, it forwards the rest of the
     # args to nvcc_wrapper
     kokkos_internal_have_compiler_nvcc(
-      ${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE)
+      ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE)
     SET(INTERNAL_USE_COMPILER_LAUNCHER true)
   ENDIF()
 ENDIF()
@@ -55,32 +61,7 @@ IF(INTERNAL_HAVE_COMPILER_NVCC)
   SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE)
   MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}")
   IF(INTERNAL_USE_COMPILER_LAUNCHER)
-    IF(Kokkos_LAUNCH_COMPILER_INFO)
-        GET_FILENAME_COMPONENT(BASE_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME)
-        # does not have STATUS intentionally
-        MESSAGE("")
-        MESSAGE("Kokkos_LAUNCH_COMPILER_INFO (${Kokkos_COMPILE_LAUNCHER}):")
-        MESSAGE("  - Kokkos + CUDA backend requires the C++ files to be compiled as CUDA code.")
-        MESSAGE("  - kokkos_launch_compiler permits CMAKE_CXX_COMPILER to be set to a traditional C++ compiler when Kokkos_ENABLE_CUDA=ON")
-        MESSAGE("    by prefixing all the compile and link commands with the path to the script + CMAKE_CXX_COMPILER (${CMAKE_CXX_COMPILER}).")
-        MESSAGE("  - If any of the compile or link commands have CMAKE_CXX_COMPILER as the first argument, it replaces CMAKE_CXX_COMPILER with nvcc_wrapper.")
-        MESSAGE("  - If the compile or link command is not CMAKE_CXX_COMPILER, it just executes the command.")
-        MESSAGE("  - If using ccache, set CMAKE_CXX_COMPILER to nvcc_wrapper explicitly.")
-        MESSAGE("  - kokkos_compiler_launcher is available to downstream projects as well.")
-        MESSAGE("    - If CMAKE_CXX_COMPILER=nvcc_wrapper, all legacy behavior will be preserved during 'find_package(Kokkos)'")
-        MESSAGE("    - If CMAKE_CXX_COMPILER is not nvcc_wrapper, 'find_package(Kokkos)' will apply 'kokkos_compilation(GLOBAL)' unless separable compilation is enabled")
-        MESSAGE("      - This can be disabled via '-DKokkos_LAUNCH_COMPILER=OFF'")
-        MESSAGE("    - Use 'find_package(Kokkos COMPONENTS separable_compilation)' to enable separable compilation")
-        MESSAGE("      - Separable compilation allows you to control the scope of where the compiler transformation behavior (${BASE_COMPILER_NAME} -> nvcc_wrapper) is applied")
-        MESSAGE("      - The compiler transformation can be applied on a per-project, per-directory, per-target, and/or per-source-file basis")
-        MESSAGE("        - 'kokkos_compilation(PROJECT)' will apply the compiler transformation to all targets in a project/subproject")
-        MESSAGE("        - 'kokkos_compilation(TARGET <TARGET> [<TARGETS>...])' will apply the compiler transformation to the specified target(s)")
-        MESSAGE("        - 'kokkos_compilation(SOURCE <SOURCE> [<SOURCES>...])' will apply the compiler transformation to the specified source file(s)")
-        MESSAGE("        - 'kokkos_compilation(DIRECTORY <DIR> [<DIRS>...])' will apply the compiler transformation to the specified directories")
-        MESSAGE("")
-    ELSE()
-        MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled... Set Kokkos_LAUNCH_COMPILER_INFO=ON for more info.")
-    ENDIF()
+    MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...")
     kokkos_compilation(GLOBAL)
   ENDIF()
 ENDIF()
@@ -92,7 +73,11 @@ IF(Kokkos_ENABLE_HIP)
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
 
   STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} )
-  SET(KOKKOS_CXX_COMPILER_ID HIP CACHE STRING INTERNAL FORCE)
+
+  STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP)
+  IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1)
+    SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE)
+  ENDIF()
 
   STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+"
          TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE})
@@ -103,8 +88,7 @@ ENDIF()
 IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
   # The Cray compiler reports as Clang to most versions of CMake
   EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
-                  COMMAND grep Cray
-                  COMMAND wc -l
+                  COMMAND grep -c Cray
                   OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
   IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang
@@ -112,8 +96,7 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
   ENDIF()
   # The clang based Intel compiler reports as Clang to most versions of CMake
   EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
-                  COMMAND grep icpx
-                  COMMAND wc -l
+                  COMMAND grep -c "DPC++\\|icpx"
                   OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
   IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang
@@ -174,7 +157,7 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
   SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE)
-ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP)
+ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
   IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_corner_cases.cmake b/packages/kokkos/cmake/kokkos_corner_cases.cmake
index 3962c4b16efbcf240b52e2463ea575d39b844a1d..a84ac2b63027e7112cb3a7b76e5e9a7b8fc892e3 100644
--- a/packages/kokkos/cmake/kokkos_corner_cases.cmake
+++ b/packages/kokkos/cmake/kokkos_corner_cases.cmake
@@ -49,11 +49,14 @@ ENDIF()
 
 IF (KOKKOS_CXX_STANDARD STREQUAL 17)
   IF (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7)
-    MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.")
+    MESSAGE(FATAL_ERROR "You have requested C++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC < 7 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.")
   ENDIF()
 
   IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11)
-    MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.")
+    MESSAGE(FATAL_ERROR "You have requested C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.")
+  ENDIF()
+  IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR)
+    MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON with C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs. See https://github.com/kokkos/kokkos/issues/3496")
   ENDIF()
 ENDIF()
 
diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake
index 41ee10a8a05c6909374be1c704b03997bb8f8618..445dad47ce561979037bf5b1622413ddda05f3b3 100644
--- a/packages/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake
@@ -48,9 +48,6 @@ IF(KOKKOS_ENABLE_OPENMP)
   IF(KOKKOS_CLANG_IS_CRAY)
     SET(ClangOpenMPFlag -fopenmp)
   ENDIF()
-  IF(KOKKOS_CLANG_IS_INTEL)
-    SET(ClangOpenMPFlag -fiopenmp)
-  ENDIF()
   IF(KOKKOS_COMPILER_CLANG_MSVC)
     #for clang-cl expression /openmp yields an error, so directly add the specific Clang flag
     SET(ClangOpenMPFlag /clang:-fopenmp=libomp)
@@ -64,6 +61,7 @@ IF(KOKKOS_ENABLE_OPENMP)
     COMPILER_SPECIFIC_FLAGS(
       COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
       Clang      -Xcompiler ${ClangOpenMPFlag}
+      IntelClang -Xcompiler -fiopenmp
       PGI        -Xcompiler -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -Xcompiler -qsmp=omp
@@ -72,6 +70,7 @@ IF(KOKKOS_ENABLE_OPENMP)
   ELSE()
     COMPILER_SPECIFIC_FLAGS(
       Clang      ${ClangOpenMPFlag}
+      IntelClang -fiopenmp
       AppleClang -Xpreprocessor -fopenmp
       PGI        -mp
       Cray       NO-VALUE-SPECIFIED
@@ -152,3 +151,11 @@ IF (KOKKOS_ENABLE_HIP)
 ENDIF()
 
 KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend")
+
+## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros
+IF (KOKKOS_ENABLE_SYCL)
+  IF(KOKKOS_CXX_STANDARD LESS 17)
+    MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!")
+  ENDIF()
+  LIST(APPEND DEVICE_SETUP_LIST SYCL)
+ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake
index 5df498f3735484dea3e2cf39e296d59135fe2774..95bce66c7bee32f8800cbd6e0324f9d4c599c97c 100644
--- a/packages/kokkos/cmake/kokkos_enable_options.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_options.cmake
@@ -48,6 +48,7 @@ KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS    OFF "Whether to print all compiler war
 KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded")
 KOKKOS_ENABLE_OPTION(TUNING               OFF "Whether to create bindings for tuning tools")
 KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
+KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER      ON  "Whether to potentially use the launch compiler")
 
 IF (KOKKOS_ENABLE_CUDA)
   SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")
@@ -68,6 +69,15 @@ ELSE()
 ENDIF()
 KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT}  "Whether to align Kokkos::complex to 2*alignof(RealType)")
 
+IF (KOKKOS_ENABLE_TESTS)
+  SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON)
+ELSE()
+  SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF)
+ENDIF()
+KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests")
+IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS)
+  MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored.")
+ENDIF()
 
 IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang))
   SET(CUDA_CONSTEXPR_DEFAULT ON)
@@ -76,14 +86,14 @@ ELSE()
 ENDIF()
 KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions")
 
+Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for")
+
 FUNCTION(check_device_specific_options)
   CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN})
   IF(NOT KOKKOS_ENABLE_${SOME_DEVICE})
     FOREACH(OPTION ${SOME_OPTIONS})
-      IF(CMAKE_VERSION VERSION_GREATER_EQUAL 3.14)
-        IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}})
-          MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.")
-        ENDIF()
+      IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}})
+        MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.")
       ENDIF()
       IF(KOKKOS_ENABLE_${OPTION})
         MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored.")
diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake
index 2b17d648b44b39a6fbdf1b48d8cbd26001aa9030..858322394d7aefcb9fe23f55a60863f3a8f63484 100644
--- a/packages/kokkos/cmake/kokkos_functions.cmake
+++ b/packages/kokkos/cmake/kokkos_functions.cmake
@@ -169,9 +169,7 @@ MACRO(kokkos_export_imported_tpl NAME)
       ENDIF()
 
       SET(TPL_LINK_OPTIONS)
-      IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13.0")
-        GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS)
-      ENDIF()
+      GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS)
       IF(TPL_LINK_OPTIONS)
         KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}")
       ENDIF()
@@ -230,9 +228,7 @@ MACRO(kokkos_import_tpl NAME)
   # I have still been getting errors about ROOT variables being ignored
   # I'm not sure if this is a scope issue - but make sure
   # the policy is set before we do any find_package calls
-  IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0")
-    CMAKE_POLICY(SET CMP0074 NEW)
-  ENDIF()
+  CMAKE_POLICY(SET CMP0074 NEW)
 
   IF (KOKKOS_ENABLE_${NAME})
     #Tack on a TPL here to make sure we avoid using anyone else's find
@@ -314,7 +310,7 @@ MACRO(kokkos_create_imported_tpl NAME)
   CMAKE_PARSE_ARGUMENTS(TPL
    "INTERFACE"
    "LIBRARY"
-   "LINK_LIBRARIES;INCLUDES;COMPILE_OPTIONS;LINK_OPTIONS"
+   "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS"
    ${ARGN})
 
 
@@ -334,6 +330,9 @@ MACRO(kokkos_create_imported_tpl NAME)
     IF(TPL_INCLUDES)
       TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES})
     ENDIF()
+    IF(TPL_COMPILE_DEFINITIONS)
+      TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS})
+    ENDIF()
     IF(TPL_COMPILE_OPTIONS)
       TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS})
     ENDIF()
@@ -355,6 +354,10 @@ MACRO(kokkos_create_imported_tpl NAME)
       SET_TARGET_PROPERTIES(${NAME} PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}")
     ENDIF()
+    IF(TPL_COMPILE_DEFINITIONS)
+      SET_TARGET_PROPERTIES(${NAME} PROPERTIES
+        INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}")
+    ENDIF()
     IF(TPL_COMPILE_OPTIONS)
       SET_TARGET_PROPERTIES(${NAME} PROPERTIES
         INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}")
@@ -770,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET)
 ENDFUNCTION()
 
 FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
-  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP Fujitsu)
+  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu)
   CMAKE_PARSE_ARGUMENTS(
     PARSE
     "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"
@@ -926,6 +929,9 @@ ENDFUNCTION()
 #       DIRECTORY   --> all files in directory
 #       PROJECT     --> all files/targets in a project/subproject
 #
+# NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in.
+# This version explicitly uses nvcc_wrapper.
+#
 FUNCTION(kokkos_compilation)
     # check whether the compiler already supports building CUDA
     KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA)
@@ -947,10 +953,21 @@ FUNCTION(kokkos_compilation)
         MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'")
     ENDIF()
 
+    # find nvcc_wrapper
+    FIND_PROGRAM(Kokkos_NVCC_WRAPPER
+        NAMES           nvcc_wrapper
+        HINTS           ${PROJECT_SOURCE_DIR}
+        PATHS           ${PROJECT_SOURCE_DIR}
+        PATH_SUFFIXES   bin)
+
+    IF(NOT Kokkos_COMPILE_LAUNCHER)
+        MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'")
+    ENDIF()
+
     IF(COMP_GLOBAL)
         # if global, don't bother setting others
-        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
-        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
+        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}")
+        SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}")
     ELSE()
         FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE)
             # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...)
@@ -961,8 +978,8 @@ FUNCTION(kokkos_compilation)
             # set the properties if defined
             IF(COMP_${_TYPE})
                 # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}")
-                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
-                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}")
+                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}")
+                SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}")
             ENDIF()
         ENDFOREACH()
     ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
index 1d7da922eb6ee931436631c648f2a1109e8bde0d..707fb000af528694780d6668f160a3fee3472a69 100644
--- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
+++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
@@ -86,6 +86,19 @@ ELSE()
   MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 14, 17, or 20")
 ENDIF()
 
+# Enforce that we can compile a simple C++14 program
+
+TRY_COMPILE(CAN_COMPILE_CPP14
+  ${KOKKOS_TOP_BUILD_DIR}/corner_cases
+  ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus14.cpp
+  OUTPUT_VARIABLE ERROR_MESSAGE
+  CXX_STANDARD 14
+)
+if (NOT CAN_COMPILE_CPP14)
+  UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this
+  MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++14 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}")
+ENDIF()
+UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this
 
 
 # Enforce that extensions are turned off for nvcc_wrapper.
diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake
index b58d3696ea9a412d9f008c0ba4e03a142a9fc5fc..d8d044c9d75384a1d8d312a94708623c735d121f 100644
--- a/packages/kokkos/cmake/kokkos_tpls.cmake
+++ b/packages/kokkos/cmake/kokkos_tpls.cmake
@@ -1,5 +1,6 @@
 KOKKOS_CFG_DEPENDS(TPLS OPTIONS)
 KOKKOS_CFG_DEPENDS(TPLS DEVICES)
+KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID)
 
 FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT)
   CMAKE_PARSE_ARGUMENTS(PARSED
@@ -38,6 +39,12 @@ IF(KOKKOS_ENABLE_MEMKIND)
 ENDIF()
 KOKKOS_TPL_OPTION(CUDA    ${Kokkos_ENABLE_CUDA} TRIBITS CUDA)
 KOKKOS_TPL_OPTION(LIBRT   Off)
+IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
+  SET(ROCM_DEFAULT ON)
+ELSE()
+  SET(ROCM_DEFAULT OFF)
+ENDIF()
+KOKKOS_TPL_OPTION(ROCM    ${ROCM_DEFAULT})
 
 IF (WIN32)
   SET(LIBDL_DEFAULT Off)
@@ -70,6 +77,7 @@ KOKKOS_IMPORT_TPL(LIBRT)
 KOKKOS_IMPORT_TPL(LIBDL)
 KOKKOS_IMPORT_TPL(MEMKIND)
 KOKKOS_IMPORT_TPL(PTHREAD INTERFACE)
+KOKKOS_IMPORT_TPL(ROCM INTERFACE)
 
 #Convert list to newlines (which CMake doesn't always like in cache variables)
 STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}")
diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake
index 059fb192f05153843c131a6d89fc4adf8fb202cf..afa036066afeef954c5fed457782546565b7cfa5 100644
--- a/packages/kokkos/cmake/kokkos_tribits.cmake
+++ b/packages/kokkos/cmake/kokkos_tribits.cmake
@@ -141,39 +141,54 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME)
 ENDFUNCTION()
 
 FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME)
-CMAKE_PARSE_ARGUMENTS(PARSE
-  ""
-  ""
-  "SOURCES;CATEGORIES;ARGS"
-  ${ARGN})
-VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS})
-
-IF (KOKKOS_HAS_TRILINOS)
-  IF(DEFINED PARSE_ARGS)
-    STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}")
-  ENDIF()
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
-    ${ROOT_NAME}
-    SOURCES ${PARSE_SOURCES}
-    TESTONLYLIBS kokkos_gtest
-    NUM_MPI_PROCS 1
-    COMM serial mpi
-    ARGS ${PARSE_ARGS}
-    CATEGORIES ${PARSE_CATEGORIES}
-    SOURCES ${PARSE_SOURCES}
-    FAIL_REGULAR_EXPRESSION "  FAILED  "
-    ARGS ${PARSE_ARGS}
-  )
-ELSE()
-  KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME}
-    SOURCES ${PARSE_SOURCES}
-  )
-  KOKKOS_ADD_TEST(NAME ${ROOT_NAME}
-    EXE ${ROOT_NAME}
-    FAIL_REGULAR_EXPRESSION "  FAILED  "
-    ARGS ${PARSE_ARGS}
-  )
-ENDIF()
+    CMAKE_PARSE_ARGUMENTS(PARSE
+    ""
+    ""
+    "SOURCES;CATEGORIES;ARGS"
+    ${ARGN})
+    VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS})
+
+    IF (KOKKOS_HAS_TRILINOS)
+        IF(DEFINED PARSE_ARGS)
+            STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}")
+        ENDIF()
+        TRIBITS_ADD_EXECUTABLE_AND_TEST(
+            ${ROOT_NAME}
+            SOURCES ${PARSE_SOURCES}
+            TESTONLYLIBS kokkos_gtest
+            NUM_MPI_PROCS 1
+            COMM serial mpi
+            ARGS ${PARSE_ARGS}
+            CATEGORIES ${PARSE_CATEGORIES}
+            SOURCES ${PARSE_SOURCES}
+            FAIL_REGULAR_EXPRESSION "  FAILED  "
+            ARGS ${PARSE_ARGS}
+        )
+    ELSE()
+        KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME}
+            SOURCES ${PARSE_SOURCES}
+        )
+        IF (PARSE_ARGS)
+            SET(TEST_NUMBER 0)
+            FOREACH (ARG_STR ${PARSE_ARGS})
+                # This is passed as a single string blob to match TriBITS behavior
+                # We need this to be turned into a list
+                STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR})
+                LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}")
+                MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1")
+                KOKKOS_ADD_TEST(NAME ${TEST_NAME}
+                    EXE ${ROOT_NAME}
+                    FAIL_REGULAR_EXPRESSION "  FAILED  "
+                    ARGS ${ARG_STR_LIST}
+                )
+            ENDFOREACH()
+        ELSE()
+            KOKKOS_ADD_TEST(NAME ${ROOT_NAME}
+                EXE ${ROOT_NAME}
+                FAIL_REGULAR_EXPRESSION "  FAILED  "
+            )
+        ENDIF()
+    ENDIF()
 ENDFUNCTION()
 
 FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME)
@@ -301,11 +316,26 @@ ENDMACRO()
 ##                        Includes generated header files, scripts such as nvcc_wrapper and hpcbind,
 ##                        as well as other files provided through plugins.
 MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES)
-  # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to nvcc_wrapper
+
+  # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler
+  # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler
+  IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper")
+    SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}")
+  ELSE()
+    IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "")
+        SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}")
+    ENDIF()
+  ENDIF()
+
+  CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler
+    ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler
+    @ONLY)
+
   INSTALL(PROGRAMS
           "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper"
           "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind"
           "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler"
+          "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler"
           DESTINATION ${CMAKE_INSTALL_BINDIR})
   INSTALL(FILES
           "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h"
@@ -313,7 +343,7 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES)
           "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp"
           "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp"
           "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp"
-          DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+          DESTINATION ${KOKKOS_HEADER_DIR})
 ENDMACRO()
 
 FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME)
@@ -330,24 +360,12 @@ FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME)
       ${LIBRARY_NAME} PUBLIC
       $<$<LINK_LANGUAGE:CXX>:${KOKKOS_LINK_OPTIONS}>
     )
-  ELSEIF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13")
+  ELSE()
     #I can use link options
     #just assume CXX linkage
     TARGET_LINK_OPTIONS(
       ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}
     )
-  ELSE()
-    #assume CXX linkage, we have no good way to check otherwise
-    IF (PARSE_PLAIN_STYLE)
-      TARGET_LINK_LIBRARIES(
-        ${LIBRARY_NAME} ${KOKKOS_LINK_OPTIONS}
-      )
-    ELSE()
-      #well, have to do it the wrong way for now
-      TARGET_LINK_LIBRARIES(
-        ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}
-      )
-    ENDIF()
   ENDIF()
 
   TARGET_COMPILE_OPTIONS(
@@ -448,6 +466,13 @@ FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME)
     ${PARSE_SOURCES}
   )
 
+  IF(PARSE_SHARED OR BUILD_SHARED_LIBS)
+    SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES
+      VERSION   ${Kokkos_VERSION}
+      SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}
+    )
+  ENDIF()
+
   KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME})
 
   #In case we are building in-tree, add an alias name
diff --git a/packages/kokkos/containers/src/CMakeLists.txt b/packages/kokkos/containers/src/CMakeLists.txt
index 7000624b6bcfca69bf2bae30bdae7d971a067a63..98655896d4f351418fc60e5330cd194fa2358d0e 100644
--- a/packages/kokkos/containers/src/CMakeLists.txt
+++ b/packages/kokkos/containers/src/CMakeLists.txt
@@ -26,8 +26,6 @@ KOKKOS_ADD_LIBRARY(
   HEADERS ${KOKKOS_CONTAINER_HEADERS}
 )
 
-SET_TARGET_PROPERTIES(kokkoscontainers PROPERTIES VERSION ${Kokkos_VERSION})
-
 KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers
   ${KOKKOS_TOP_BUILD_DIR}
   ${CMAKE_CURRENT_BINARY_DIR}
@@ -36,4 +34,3 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers
 KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore)
 
 #-----------------------------------------------------------------------------
-
diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp
index 689f0eb2ed4e14597ce22d284060fd9b5576eb18..45710d1f737ca14348dd79d698bbc4a581225bbb 100644
--- a/packages/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp
@@ -91,6 +91,25 @@ namespace Kokkos {
  *     behavior.  Please see the documentation of Kokkos::View for
  *     examples.  The default suffices for most users.
  */
+
+namespace Impl {
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+inline const Kokkos::Cuda& get_cuda_space(const Kokkos::Cuda& in) { return in; }
+
+inline const Kokkos::Cuda& get_cuda_space() {
+  return *Kokkos::Impl::cuda_get_deep_copy_space();
+}
+
+template <typename NonCudaExecSpace>
+inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) {
+  return get_cuda_space();
+}
+
+#endif  // KOKKOS_ENABLE_CUDA
+
+}  // namespace Impl
 template <class DataType, class Arg1Type = void, class Arg2Type = void,
           class Arg3Type = void>
 class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
@@ -295,6 +314,53 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
           "DualView constructed with incompatible views");
     }
   }
+  // does the DualView have only one device
+  struct impl_dualview_is_single_device {
+    enum : bool {
+      value = std::is_same<typename t_dev::device_type,
+                           typename t_host::device_type>::value
+    };
+  };
+
+  // does the given device match the device of t_dev?
+  template <typename Device>
+  struct impl_device_matches_tdev_device {
+    enum : bool {
+      value = std::is_same<typename t_dev::device_type, Device>::value
+    };
+  };
+  // does the given device match the device of t_host?
+  template <typename Device>
+  struct impl_device_matches_thost_device {
+    enum : bool {
+      value = std::is_same<typename t_host::device_type, Device>::value
+    };
+  };
+
+  // does the given device match the execution space of t_host?
+  template <typename Device>
+  struct impl_device_matches_thost_exec {
+    enum : bool {
+      value = std::is_same<typename t_host::execution_space, Device>::value
+    };
+  };
+
+  // does the given device match the execution space of t_dev?
+  template <typename Device>
+  struct impl_device_matches_tdev_exec {
+    enum : bool {
+      value = std::is_same<typename t_dev::execution_space, Device>::value
+    };
+  };
+
+  // does the given device's memory space match the memory space of t_dev?
+  template <typename Device>
+  struct impl_device_matches_tdev_memory_space {
+    enum : bool {
+      value = std::is_same<typename t_dev::memory_space,
+                           typename Device::memory_space>::value
+    };
+  };
 
   //@}
   //! \name Methods for synchronizing, marking as modified, and getting Views.
@@ -302,7 +368,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 
   /// \brief Return a View on a specific device \c Device.
   ///
-  /// Please don't be afraid of the if_c expression in the return
+  /// Please don't be afraid of the nested if_c expressions in the return
   /// value's type.  That just tells the method what the return type
   /// should be: t_dev if the \c Device template parameter matches
   /// this DualView's device type, else t_host.
@@ -323,10 +389,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   ///   typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
   /// \endcode
   template <class Device>
-  KOKKOS_INLINE_FUNCTION const typename Impl::if_c<
-      std::is_same<typename t_dev::memory_space,
-                   typename Device::memory_space>::value,
-      t_dev, t_host>::type&
+  KOKKOS_INLINE_FUNCTION const typename std::conditional_t<
+      impl_device_matches_tdev_device<Device>::value, t_dev,
+      typename std::conditional_t<
+          impl_device_matches_thost_device<Device>::value, t_host,
+          typename std::conditional_t<
+              impl_device_matches_thost_exec<Device>::value, t_host,
+              typename std::conditional_t<
+                  impl_device_matches_tdev_exec<Device>::value, t_dev,
+                  typename std::conditional_t<
+                      impl_device_matches_tdev_memory_space<Device>::value,
+                      t_dev, t_host> > > > >
   view() const {
     constexpr bool device_is_memspace =
         std::is_same<Device, typename Device::memory_space>::value;
@@ -463,6 +536,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
           true);
     }
   }
+
   /// \brief Update data on device or host only if data in the other
   ///   space has been marked as modified.
   ///
@@ -480,12 +554,9 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   ///   the data in either View.  You must manually mark modified data
   ///   as modified, by calling the modify() method with the
   ///   appropriate template parameter.
-  template <class Device>
-  void sync(const typename std::enable_if<
-                (std::is_same<typename traits::data_type,
-                              typename traits::non_const_data_type>::value) ||
-                    (std::is_same<Device, int>::value),
-                int>::type& = 0) {
+  // deliberately passing args by cref as they're used multiple times
+  template <class Device, class... Args>
+  void sync_impl(std::true_type, Args const&... args) {
     if (modified_flags.data() == nullptr) return;
 
     int dev = get_device_side<Device>();
@@ -497,12 +568,12 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                          Kokkos::CudaUVMSpace>::value) {
           if (d_view.data() == h_view.data())
             Kokkos::Impl::cuda_prefetch_pointer(
-                Kokkos::Cuda(), d_view.data(),
+                Impl::get_cuda_space(args...), d_view.data(),
                 sizeof(typename t_dev::value_type) * d_view.span(), true);
         }
 #endif
 
-        deep_copy(d_view, h_view);
+        deep_copy(args..., d_view, h_view);
         modified_flags(0) = modified_flags(1) = 0;
         impl_report_device_sync();
       }
@@ -514,12 +585,12 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                          Kokkos::CudaUVMSpace>::value) {
           if (d_view.data() == h_view.data())
             Kokkos::Impl::cuda_prefetch_pointer(
-                Kokkos::Cuda(), d_view.data(),
+                Impl::get_cuda_space(args...), d_view.data(),
                 sizeof(typename t_dev::value_type) * d_view.span(), false);
         }
 #endif
 
-        deep_copy(h_view, d_view);
+        deep_copy(args..., h_view, d_view);
         modified_flags(0) = modified_flags(1) = 0;
         impl_report_host_sync();
       }
@@ -533,10 +604,26 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 
   template <class Device>
   void sync(const typename std::enable_if<
-                (!std::is_same<typename traits::data_type,
-                               typename traits::non_const_data_type>::value) ||
+                (std::is_same<typename traits::data_type,
+                              typename traits::non_const_data_type>::value) ||
                     (std::is_same<Device, int>::value),
                 int>::type& = 0) {
+    sync_impl<Device>(std::true_type{});
+  }
+
+  template <class Device, class ExecutionSpace>
+  void sync(const ExecutionSpace& exec,
+            const typename std::enable_if<
+                (std::is_same<typename traits::data_type,
+                              typename traits::non_const_data_type>::value) ||
+                    (std::is_same<Device, int>::value),
+                int>::type& = 0) {
+    sync_impl<Device>(std::true_type{}, exec);
+  }
+
+  // deliberately passing args by cref as they're used multiple times
+  template <class Device, class... Args>
+  void sync_impl(std::false_type, Args const&...) {
     if (modified_flags.data() == nullptr) return;
 
     int dev = get_device_side<Device>();
@@ -557,7 +644,27 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
   }
 
-  void sync_host() {
+  template <class Device>
+  void sync(const typename std::enable_if<
+                (!std::is_same<typename traits::data_type,
+                               typename traits::non_const_data_type>::value) ||
+                    (std::is_same<Device, int>::value),
+                int>::type& = 0) {
+    sync_impl<Device>(std::false_type{});
+  }
+  template <class Device, class ExecutionSpace>
+  void sync(const ExecutionSpace& exec,
+            const typename std::enable_if<
+                (!std::is_same<typename traits::data_type,
+                               typename traits::non_const_data_type>::value) ||
+                    (std::is_same<Device, int>::value),
+                int>::type& = 0) {
+    sync_impl<Device>(std::false_type{}, exec);
+  }
+
+  // deliberately passing args by cref as they're used multiple times
+  template <typename... Args>
+  void sync_host_impl(Args const&... args) {
     if (!std::is_same<typename traits::data_type,
                       typename traits::non_const_data_type>::value)
       Impl::throw_runtime_exception(
@@ -569,18 +676,26 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                        Kokkos::CudaUVMSpace>::value) {
         if (d_view.data() == h_view.data())
           Kokkos::Impl::cuda_prefetch_pointer(
-              Kokkos::Cuda(), d_view.data(),
+              Impl::get_cuda_space(args...), d_view.data(),
               sizeof(typename t_dev::value_type) * d_view.span(), false);
       }
 #endif
 
-      deep_copy(h_view, d_view);
+      deep_copy(args..., h_view, d_view);
       modified_flags(1) = modified_flags(0) = 0;
       impl_report_host_sync();
     }
   }
 
-  void sync_device() {
+  template <class ExecSpace>
+  void sync_host(const ExecSpace& exec) {
+    sync_host_impl(exec);
+  }
+  void sync_host() { sync_host_impl(); }
+
+  // deliberately passing args by cref as they're used multiple times
+  template <typename... Args>
+  void sync_device_impl(Args const&... args) {
     if (!std::is_same<typename traits::data_type,
                       typename traits::non_const_data_type>::value)
       Impl::throw_runtime_exception(
@@ -592,17 +707,23 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                        Kokkos::CudaUVMSpace>::value) {
         if (d_view.data() == h_view.data())
           Kokkos::Impl::cuda_prefetch_pointer(
-              Kokkos::Cuda(), d_view.data(),
+              Impl::get_cuda_space(args...), d_view.data(),
               sizeof(typename t_dev::value_type) * d_view.span(), true);
       }
 #endif
 
-      deep_copy(d_view, h_view);
+      deep_copy(args..., d_view, h_view);
       modified_flags(1) = modified_flags(0) = 0;
       impl_report_device_sync();
     }
   }
 
+  template <class ExecSpace>
+  void sync_device(const ExecSpace& exec) {
+    sync_device_impl(exec);
+  }
+  void sync_device() { sync_device_impl(); }
+
   template <class Device>
   bool need_sync() const {
     if (modified_flags.data() == nullptr) return false;
@@ -658,6 +779,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   template <class Device>
   void modify() {
     if (modified_flags.data() == nullptr) return;
+    if (impl_dualview_is_single_device::value) return;
     int dev = get_device_side<Device>();
 
     if (dev == 1) {  // if Device is the same as DualView's device type
@@ -690,6 +812,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   }
 
   inline void modify_host() {
+    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(0) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@@ -710,6 +833,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   }
 
   inline void modify_device() {
+    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(1) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
index c66d7a5f36caabc18c4559e85855529dbfae15b6..c6323fef93694de1ee39d5784141bf6991f78bd7 100644
--- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -245,13 +245,10 @@ KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds(
     return (size_t(i) < map.extent(R)) &&
            dyn_rank_view_verify_operator_bounds<R + 1>(rank, map, args...);
   } else if (i != 0) {
-    // FIXME_SYCL SYCL doesn't allow printf in kernels
-#ifndef KOKKOS_ENABLE_SYCL
-    printf(
+    KOKKOS_IMPL_DO_NOT_USE_PRINTF(
         "DynRankView Debug Bounds Checking Error: at rank %u\n  Extra "
         "arguments beyond the rank must be zero \n",
         R);
-#endif
     return (false) &&
            dyn_rank_view_verify_operator_bounds<R + 1>(rank, map, args...);
   } else {
@@ -575,37 +572,22 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
                      (is_layout_left || is_layout_right || is_layout_stride)
   };
 
-  template <class Space, bool = Kokkos::Impl::MemorySpaceAccess<
-                             Space, typename traits::memory_space>::accessible>
-  struct verify_space {
-    KOKKOS_FORCEINLINE_FUNCTION static void check() {}
-  };
-
-  template <class Space>
-  struct verify_space<Space, false> {
-    KOKKOS_FORCEINLINE_FUNCTION static void check() {
-      Kokkos::abort(
-          "Kokkos::DynRankView ERROR: attempt to access inaccessible memory "
-          "space");
-    };
-  };
-
 // Bounds checking macros
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
 
 // rank of the calling operator - included as first argument in ARG
-#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)             \
-  DynRankView::template verify_space<                     \
-      Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \
-  Kokkos::Impl::dyn_rank_view_verify_operator_bounds<     \
-      typename traits::memory_space>                      \
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)                          \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             typename traits::memory_space>::check();  \
+  Kokkos::Impl::dyn_rank_view_verify_operator_bounds<                  \
+      typename traits::memory_space>                                   \
       ARG;
 
 #else
 
-#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \
-  DynRankView::template verify_space<         \
-      Kokkos::Impl::ActiveExecutionMemorySpace>::check();
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)                          \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             typename traits::memory_space>::check();
 
 #endif
 
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
index 06bd5566619926b5bb4c6f55e8a3166f90dcdb4b..cc949d4c556ab4abd982ea5334fee870c42ef305 100644
--- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -76,6 +76,12 @@ struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> {
   using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace;
 };
 #endif
+#ifdef KOKKOS_ENABLE_SYCL
+template <>
+struct ChunkArraySpace<Kokkos::Experimental::SYCLDeviceUSMSpace> {
+  using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace;
+};
+#endif
 }  // end namespace Impl
 
 /** \brief Dynamic views are restricted to rank-one and no layout.
diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
index 4fd084338ed731213d12792aca31826fcd89e75e..0f21a08ba3ba86ed176dc4c4535ef76c960e90bc 100644
--- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -377,34 +377,20 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
       std::is_same<typename traits::specialize, void>::value &&
       (is_layout_left || is_layout_right || is_layout_stride);
 
-  template <class Space, bool = Kokkos::Impl::MemorySpaceAccess<
-                             Space, typename traits::memory_space>::accessible>
-  struct verify_space {
-    KOKKOS_FORCEINLINE_FUNCTION static void check() {}
-  };
-
-  template <class Space>
-  struct verify_space<Space, false> {
-    KOKKOS_FORCEINLINE_FUNCTION static void check() {
-      Kokkos::abort(
-          "Kokkos::View ERROR: attempt to access inaccessible memory space");
-    };
-  };
-
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
 
-#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG)              \
-  OffsetView::template verify_space<                             \
-      Kokkos::Impl::ActiveExecutionMemorySpace>::check();        \
-  Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \
-      typename traits::memory_space>                             \
+#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG)                    \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             typename traits::memory_space>::check();  \
+  Kokkos::Experimental::Impl::offsetview_verify_operator_bounds<       \
+      typename traits::memory_space>                                   \
       ARG;
 
 #else
 
-#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \
-  OffsetView::template verify_space<                \
-      Kokkos::Impl::ActiveExecutionMemorySpace>::check();
+#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG)                    \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             typename traits::memory_space>::check();
 
 #endif
  public:
diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
index 5e18f5a80eaba9ab4227bc648a7548d4bcb9802a..dcd4cf73e5d710bc427772a8a8de6384e80c9dae 100644
--- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -649,13 +649,13 @@ struct ReduceDuplicatesBase {
   size_t stride;
   size_t start;
   size_t n;
-  ReduceDuplicatesBase(ValueType const* src_in, ValueType* dest_in,
-                       size_t stride_in, size_t start_in, size_t n_in,
-                       std::string const& name)
+  ReduceDuplicatesBase(ExecSpace const& exec_space, ValueType const* src_in,
+                       ValueType* dest_in, size_t stride_in, size_t start_in,
+                       size_t n_in, std::string const& name)
       : src(src_in), dst(dest_in), stride(stride_in), start(start_in), n(n_in) {
     parallel_for(
         std::string("Kokkos::ScatterView::ReduceDuplicates [") + name + "]",
-        RangePolicy<ExecSpace, size_t>(0, stride),
+        RangePolicy<ExecSpace, size_t>(exec_space, 0, stride),
         static_cast<Derived const&>(*this));
   }
 };
@@ -667,9 +667,10 @@ template <typename ExecSpace, typename ValueType, typename Op>
 struct ReduceDuplicates
     : public ReduceDuplicatesBase<ExecSpace, ValueType, Op> {
   using Base = ReduceDuplicatesBase<ExecSpace, ValueType, Op>;
-  ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in,
-                   size_t start_in, size_t n_in, std::string const& name)
-      : Base(src_in, dst_in, stride_in, start_in, n_in, name) {}
+  ReduceDuplicates(ExecSpace const& exec_space, ValueType const* src_in,
+                   ValueType* dst_in, size_t stride_in, size_t start_in,
+                   size_t n_in, std::string const& name)
+      : Base(exec_space, src_in, dst_in, stride_in, start_in, n_in, name) {}
   KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
     for (size_t j = Base::start; j < Base::n; ++j) {
       ScatterValue<ValueType, Op, ExecSpace,
@@ -687,12 +688,12 @@ template <typename ExecSpace, typename ValueType, typename Op>
 struct ResetDuplicatesBase {
   using Derived = ResetDuplicates<ExecSpace, ValueType, Op>;
   ValueType* data;
-  ResetDuplicatesBase(ValueType* data_in, size_t size_in,
-                      std::string const& name)
+  ResetDuplicatesBase(ExecSpace const& exec_space, ValueType* data_in,
+                      size_t size_in, std::string const& name)
       : data(data_in) {
     parallel_for(
         std::string("Kokkos::ScatterView::ResetDuplicates [") + name + "]",
-        RangePolicy<ExecSpace, size_t>(0, size_in),
+        RangePolicy<ExecSpace, size_t>(exec_space, 0, size_in),
         static_cast<Derived const&>(*this));
   }
 };
@@ -703,8 +704,9 @@ struct ResetDuplicatesBase {
 template <typename ExecSpace, typename ValueType, typename Op>
 struct ResetDuplicates : public ResetDuplicatesBase<ExecSpace, ValueType, Op> {
   using Base = ResetDuplicatesBase<ExecSpace, ValueType, Op>;
-  ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name)
-      : Base(data_in, size_in, name) {}
+  ResetDuplicates(ExecSpace const& exec_space, ValueType* data_in,
+                  size_t size_in, std::string const& name)
+      : Base(exec_space, data_in, size_in, name) {}
   KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
     ScatterValue<ValueType, Op, ExecSpace,
                  Kokkos::Experimental::ScatterNonAtomic>
@@ -713,6 +715,16 @@ struct ResetDuplicates : public ResetDuplicatesBase<ExecSpace, ValueType, Op> {
   }
 };
 
+template <typename... P>
+void check_scatter_view_allocation_properties_argument(
+    ViewCtorProp<P...> const&) {
+  static_assert(ViewCtorProp<P...>::has_execution_space &&
+                    ViewCtorProp<P...>::has_label &&
+                    ViewCtorProp<P...>::initialize,
+                "Allocation property must have an execution name as well as a "
+                "label, and must perform the view initialization");
+}
+
 }  // namespace Experimental
 }  // namespace Impl
 }  // namespace Kokkos
@@ -762,10 +774,26 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
   ScatterView(View<RT, RP...> const& original_view)
       : internal_view(original_view) {}
 
+  template <typename RT, typename... P, typename... RP>
+  ScatterView(execution_space const& /* exec_space */,
+              View<RT, RP...> const& original_view)
+      : internal_view(original_view) {}
+
   template <typename... Dims>
   ScatterView(std::string const& name, Dims... dims)
       : internal_view(name, dims...) {}
 
+  // This overload allows specifying an execution space instance to be
+  // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as
+  // first argument.
+  template <typename... P, typename... Dims>
+  ScatterView(::Kokkos::Impl::ViewCtorProp<P...> const& arg_prop, Dims... dims)
+      : internal_view(arg_prop, dims...) {
+    using ::Kokkos::Impl::Experimental::
+        check_scatter_view_allocation_properties_argument;
+    check_scatter_view_allocation_properties_argument(arg_prop);
+  }
+
   template <typename OtherDataType, typename OtherDeviceType>
   KOKKOS_FUNCTION ScatterView(
       const ScatterView<OtherDataType, Layout, OtherDeviceType, Op,
@@ -796,27 +824,41 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
 
   template <typename DT, typename... RP>
   void contribute_into(View<DT, RP...> const& dest) const {
+    contribute_into(execution_space(), dest);
+  }
+
+  template <typename DT, typename... RP>
+  void contribute_into(execution_space const& exec_space,
+                       View<DT, RP...> const& dest) const {
     using dest_type = View<DT, RP...>;
     static_assert(std::is_same<typename dest_type::array_layout, Layout>::value,
                   "ScatterView contribute destination has different layout");
     static_assert(
-        Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
-            memory_space, typename dest_type::memory_space>::value,
+        Kokkos::Impl::SpaceAccessibility<
+            execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView contribute destination memory space not accessible");
     if (dest.data() == internal_view.data()) return;
     Kokkos::Impl::Experimental::ReduceDuplicates<execution_space,
                                                  original_value_type, Op>(
-        internal_view.data(), dest.data(), 0, 0, 1, internal_view.label());
+        exec_space, internal_view.data(), dest.data(), 0, 0, 1,
+        internal_view.label());
   }
 
-  void reset() {
+  void reset(execution_space const& exec_space = execution_space()) {
     Kokkos::Impl::Experimental::ResetDuplicates<execution_space,
                                                 original_value_type, Op>(
-        internal_view.data(), internal_view.size(), internal_view.label());
+        exec_space, internal_view.data(), internal_view.size(),
+        internal_view.label());
   }
   template <typename DT, typename... RP>
   void reset_except(View<DT, RP...> const& view) {
-    if (view.data() != internal_view.data()) reset();
+    reset_except(execution_space(), view);
+  }
+
+  template <typename DT, typename... RP>
+  void reset_except(const execution_space& exec_space,
+                    View<DT, RP...> const& view) {
+    if (view.data() != internal_view.data()) reset(exec_space);
   }
 
   void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0,
@@ -928,10 +970,16 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
 
   template <typename RT, typename... RP>
   ScatterView(View<RT, RP...> const& original_view)
+      : ScatterView(execution_space(), original_view) {}
+
+  template <typename RT, typename... P, typename... RP>
+  ScatterView(execution_space const& exec_space,
+              View<RT, RP...> const& original_view)
       : unique_token(),
         internal_view(
             view_alloc(WithoutInitializing,
-                       std::string("duplicated_") + original_view.label()),
+                       std::string("duplicated_") + original_view.label(),
+                       exec_space),
             unique_token.size(),
             original_view.rank_dynamic > 0 ? original_view.extent(0)
                                            : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -949,14 +997,32 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                                            : KOKKOS_IMPL_CTOR_DEFAULT_ARG)
 
   {
-    reset();
+    reset(exec_space);
   }
 
   template <typename... Dims>
   ScatterView(std::string const& name, Dims... dims)
-      : internal_view(view_alloc(WithoutInitializing, name),
+      : ScatterView(view_alloc(execution_space(), name), dims...) {}
+
+  // This overload allows specifying an execution space instance to be
+  // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as
+  // first argument.
+  template <typename... P, typename... Dims>
+  ScatterView(::Kokkos::Impl::ViewCtorProp<P...> const& arg_prop, Dims... dims)
+      : internal_view(view_alloc(WithoutInitializing,
+                                 static_cast<::Kokkos::Impl::ViewCtorProp<
+                                     void, std::string> const&>(arg_prop)
+                                     .value),
                       unique_token.size(), dims...) {
-    reset();
+    using ::Kokkos::Impl::Experimental::
+        check_scatter_view_allocation_properties_argument;
+    check_scatter_view_allocation_properties_argument(arg_prop);
+
+    auto const exec_space =
+        static_cast<::Kokkos::Impl::ViewCtorProp<void, execution_space> const&>(
+            arg_prop)
+            .value;
+    reset(exec_space);
   }
 
   template <typename OverrideContribution = Contribution>
@@ -984,37 +1050,51 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
 
   template <typename DT, typename... RP>
   void contribute_into(View<DT, RP...> const& dest) const {
+    contribute_into(execution_space(), dest);
+  }
+
+  template <typename DT, typename... RP>
+  void contribute_into(execution_space const& exec_space,
+                       View<DT, RP...> const& dest) const {
     using dest_type = View<DT, RP...>;
     static_assert(std::is_same<typename dest_type::array_layout,
                                Kokkos::LayoutRight>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
-            memory_space, typename dest_type::memory_space>::value,
+        Kokkos::Impl::SpaceAccessibility<
+            execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     bool is_equal = (dest.data() == internal_view.data());
     size_t start  = is_equal ? 1 : 0;
     Kokkos::Impl::Experimental::ReduceDuplicates<execution_space,
                                                  original_value_type, Op>(
-        internal_view.data(), dest.data(), internal_view.stride(0), start,
-        internal_view.extent(0), internal_view.label());
+        exec_space, internal_view.data(), dest.data(), internal_view.stride(0),
+        start, internal_view.extent(0), internal_view.label());
   }
 
-  void reset() {
+  void reset(execution_space const& exec_space = execution_space()) {
     Kokkos::Impl::Experimental::ResetDuplicates<execution_space,
                                                 original_value_type, Op>(
-        internal_view.data(), internal_view.size(), internal_view.label());
+        exec_space, internal_view.data(), internal_view.size(),
+        internal_view.label());
   }
+
   template <typename DT, typename... RP>
   void reset_except(View<DT, RP...> const& view) {
+    reset_except(execution_space(), view);
+  }
+
+  template <typename DT, typename... RP>
+  void reset_except(execution_space const& exec_space,
+                    View<DT, RP...> const& view) {
     if (view.data() != internal_view.data()) {
-      reset();
+      reset(exec_space);
       return;
     }
     Kokkos::Impl::Experimental::ResetDuplicates<execution_space,
                                                 original_value_type, Op>(
-        internal_view.data() + view.size(), internal_view.size() - view.size(),
-        internal_view.label());
+        exec_space, internal_view.data() + view.size(),
+        internal_view.size() - view.size(), internal_view.label());
   }
 
   void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0,
@@ -1075,7 +1155,13 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
   ScatterView() = default;
 
   template <typename RT, typename... RP>
-  ScatterView(View<RT, RP...> const& original_view) : unique_token() {
+  ScatterView(View<RT, RP...> const& original_view)
+      : ScatterView(execution_space(), original_view) {}
+
+  template <typename RT, typename... P, typename... RP>
+  ScatterView(execution_space const& exec_space,
+              View<RT, RP...> const& original_view)
+      : unique_token() {
     size_t arg_N[8] = {original_view.rank > 0 ? original_view.extent(0)
                                               : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                        original_view.rank > 1 ? original_view.extent(1)
@@ -1094,14 +1180,27 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
     arg_N[internal_view_type::rank - 1] = unique_token.size();
     internal_view                       = internal_view_type(
         view_alloc(WithoutInitializing,
-                   std::string("duplicated_") + original_view.label()),
+                   std::string("duplicated_") + original_view.label(),
+                   exec_space),
         arg_N[0], arg_N[1], arg_N[2], arg_N[3], arg_N[4], arg_N[5], arg_N[6],
         arg_N[7]);
-    reset();
+    reset(exec_space);
   }
 
   template <typename... Dims>
-  ScatterView(std::string const& name, Dims... dims) {
+  ScatterView(std::string const& name, Dims... dims)
+      : ScatterView(view_alloc(execution_space(), name), dims...) {}
+
+  // This overload allows specifying an execution space instance to be
+  // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as
+  // first argument.
+  template <typename... P, typename... Dims>
+  ScatterView(::Kokkos::Impl::ViewCtorProp<P...> const& arg_prop,
+              Dims... dims) {
+    using ::Kokkos::Impl::Experimental::
+        check_scatter_view_allocation_properties_argument;
+    check_scatter_view_allocation_properties_argument(arg_prop);
+
     original_view_type original_view;
     size_t arg_N[8] = {original_view.rank > 0 ? original_view.static_extent(0)
                                               : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -1120,10 +1219,20 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
                        KOKKOS_IMPL_CTOR_DEFAULT_ARG};
     Kokkos::Impl::Experimental::args_to_array(arg_N, 0, dims...);
     arg_N[internal_view_type::rank - 1] = unique_token.size();
+
+    auto const name =
+        static_cast<::Kokkos::Impl::ViewCtorProp<void, std::string> const&>(
+            arg_prop)
+            .value;
     internal_view = internal_view_type(view_alloc(WithoutInitializing, name),
                                        arg_N[0], arg_N[1], arg_N[2], arg_N[3],
                                        arg_N[4], arg_N[5], arg_N[6], arg_N[7]);
-    reset();
+
+    auto const exec_space =
+        static_cast<::Kokkos::Impl::ViewCtorProp<void, execution_space> const&>(
+            arg_prop)
+            .value;
+    reset(exec_space);
   }
 
   template <typename OtherDataType, typename OtherDeviceType>
@@ -1166,6 +1275,12 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
 
   template <typename... RP>
   void contribute_into(View<RP...> const& dest) const {
+    contribute_into(execution_space(), dest);
+  }
+
+  template <typename... RP>
+  void contribute_into(execution_space const& exec_space,
+                       View<RP...> const& dest) const {
     using dest_type = View<RP...>;
     static_assert(
         std::is_same<typename dest_type::value_type,
@@ -1175,34 +1290,42 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
                                Kokkos::LayoutLeft>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
-            memory_space, typename dest_type::memory_space>::value,
+        Kokkos::Impl::SpaceAccessibility<
+            execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     auto extent   = internal_view.extent(internal_view_type::rank - 1);
     bool is_equal = (dest.data() == internal_view.data());
     size_t start  = is_equal ? 1 : 0;
     Kokkos::Impl::Experimental::ReduceDuplicates<execution_space,
                                                  original_value_type, Op>(
-        internal_view.data(), dest.data(),
+        exec_space, internal_view.data(), dest.data(),
         internal_view.stride(internal_view_type::rank - 1), start, extent,
         internal_view.label());
   }
 
-  void reset() {
+  void reset(execution_space const& exec_space = execution_space()) {
     Kokkos::Impl::Experimental::ResetDuplicates<execution_space,
                                                 original_value_type, Op>(
-        internal_view.data(), internal_view.size(), internal_view.label());
+        exec_space, internal_view.data(), internal_view.size(),
+        internal_view.label());
   }
+
   template <typename DT, typename... RP>
   void reset_except(View<DT, RP...> const& view) {
+    reset_except(execution_space(), view);
+  }
+
+  template <typename DT, typename... RP>
+  void reset_except(execution_space const& exec_space,
+                    View<DT, RP...> const& view) {
     if (view.data() != internal_view.data()) {
-      reset();
+      reset(exec_space);
       return;
     }
     Kokkos::Impl::Experimental::ResetDuplicates<execution_space,
                                                 original_value_type, Op>(
-        internal_view.data() + view.size(), internal_view.size() - view.size(),
-        internal_view.label());
+        exec_space, internal_view.data() + view.size(),
+        internal_view.size() - view.size(), internal_view.label());
   }
 
   void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0,
@@ -1316,21 +1439,21 @@ template <typename Op          = Kokkos::Experimental::ScatterSum,
 ScatterView<
     RT, typename ViewTraits<RT, RP...>::array_layout,
     typename ViewTraits<RT, RP...>::device_type, Op,
-    typename Kokkos::Impl::if_c<
+    std::conditional_t<
         std::is_same<Duplication, void>::value,
         typename Kokkos::Impl::Experimental::DefaultDuplication<
             typename ViewTraits<RT, RP...>::execution_space>::type,
-        Duplication>::type,
-    typename Kokkos::Impl::if_c<
+        Duplication>,
+    std::conditional_t<
         std::is_same<Contribution, void>::value,
         typename Kokkos::Impl::Experimental::DefaultContribution<
             typename ViewTraits<RT, RP...>::execution_space,
-            typename Kokkos::Impl::if_c<
+            typename std::conditional_t<
                 std::is_same<Duplication, void>::value,
                 typename Kokkos::Impl::Experimental::DefaultDuplication<
                     typename ViewTraits<RT, RP...>::execution_space>::type,
-                Duplication>::type>::type,
-        Contribution>::type>
+                Duplication>>::type,
+        Contribution>>
 create_scatter_view(View<RT, RP...> const& original_view) {
   return original_view;  // implicit ScatterView constructor call
 }
@@ -1365,12 +1488,21 @@ create_scatter_view(Op, Duplication, Contribution,
 namespace Kokkos {
 namespace Experimental {
 
+template <typename DT1, typename DT2, typename LY, typename ES, typename OP,
+          typename CT, typename DP, typename... VP>
+void contribute(
+    typename ES::execution_space const& exec_space, View<DT1, VP...>& dest,
+    Kokkos::Experimental::ScatterView<DT2, LY, ES, OP, CT, DP> const& src) {
+  src.contribute_into(exec_space, dest);
+}
+
 template <typename DT1, typename DT2, typename LY, typename ES, typename OP,
           typename CT, typename DP, typename... VP>
 void contribute(
     View<DT1, VP...>& dest,
     Kokkos::Experimental::ScatterView<DT2, LY, ES, OP, CT, DP> const& src) {
-  src.contribute_into(dest);
+  using execution_space = typename ES::execution_space;
+  contribute(execution_space{}, dest, src);
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index d2affda93affa2f8e1d03b72a1cf0e49c415d158..edb0e7261da93bb629cad4e9cc7c7d3118868288 100644
--- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -264,26 +264,24 @@ class UnorderedMap {
  private:
   enum : size_type { invalid_index = ~static_cast<size_type>(0) };
 
-  using impl_value_type =
-      typename Impl::if_c<is_set, int, declared_value_type>::type;
+  using impl_value_type = std::conditional_t<is_set, int, declared_value_type>;
 
-  using key_type_view = typename Impl::if_c<
+  using key_type_view = std::conditional_t<
       is_insertable_map, View<key_type *, device_type>,
-      View<const key_type *, device_type, MemoryTraits<RandomAccess> > >::type;
+      View<const key_type *, device_type, MemoryTraits<RandomAccess> > >;
 
-  using value_type_view =
-      typename Impl::if_c<is_insertable_map || is_modifiable_map,
-                          View<impl_value_type *, device_type>,
-                          View<const impl_value_type *, device_type,
-                               MemoryTraits<RandomAccess> > >::type;
+  using value_type_view = std::conditional_t<
+      is_insertable_map || is_modifiable_map,
+      View<impl_value_type *, device_type>,
+      View<const impl_value_type *, device_type, MemoryTraits<RandomAccess> > >;
 
-  using size_type_view = typename Impl::if_c<
+  using size_type_view = std::conditional_t<
       is_insertable_map, View<size_type *, device_type>,
-      View<const size_type *, device_type, MemoryTraits<RandomAccess> > >::type;
+      View<const size_type *, device_type, MemoryTraits<RandomAccess> > >;
 
   using bitset_type =
-      typename Impl::if_c<is_insertable_map, Bitset<execution_space>,
-                          ConstBitset<execution_space> >::type;
+      std::conditional_t<is_insertable_map, Bitset<execution_space>,
+                         ConstBitset<execution_space> >;
 
   enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
   enum { num_scalars = 3 };
@@ -540,10 +538,7 @@ class UnorderedMap {
           // Previously claimed an unused entry that was not inserted.
           // Release this unused entry immediately.
           if (!m_available_indexes.reset(new_index)) {
-            // FIXME_SYCL SYCL doesn't allow printf in kernels
-#ifndef KOKKOS_ENABLE_SYCL
-            printf("Unable to free existing\n");
-#endif
+            KOKKOS_IMPL_DO_NOT_USE_PRINTF("Unable to free existing\n");
           }
         }
 
@@ -659,8 +654,8 @@ class UnorderedMap {
   ///
   /// 'const value_type' via Cuda texture fetch must return by value.
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::if_c<(is_set || has_const_value), impl_value_type,
-                      impl_value_type &>::type
+  std::conditional_t<(is_set || has_const_value), impl_value_type,
+                     impl_value_type &>
   value_at(size_type i) const {
     return m_values[is_set ? 0 : (i < capacity() ? i : capacity())];
   }
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
index 6e450598d1eb3f1c9b533044bfaa5c46f035d519..6047e60f3dd080b8cfe456627ccc80266e7df66b 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -57,10 +57,22 @@
 namespace Kokkos {
 namespace Impl {
 
+KOKKOS_FORCEINLINE_FUNCTION
+unsigned rotate_left(unsigned i, int r) {
+  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
+  return r ? ((i << r) | (i >> (size - r))) : i;
+}
+
 KOKKOS_FORCEINLINE_FUNCTION
 unsigned rotate_right(unsigned i, int r) {
-  enum { size = static_cast<int>(sizeof(unsigned) * CHAR_BIT) };
+  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
+  // FIXME_SYCL llvm.fshr.i32 missing
+  // (https://github.com/intel/llvm/issues/3308)
+#ifdef __SYCL_DEVICE_ONLY__
+  return rotate_left(i, size - r);
+#else
   return r ? ((i >> r) | (i << (size - r))) : i;
+#endif
 }
 
 template <typename Bitset>
diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
index b06ab0846c9a0f3d2dd2a082191030be68de5ae5..d7c4a5d1ffdf9969e3c158473e7fb5754113a665 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
@@ -250,8 +250,8 @@ struct UnorderedMapPrint {
     uint32_t list = m_map.m_hash_lists(i);
     for (size_type curr = list, ii = 0; curr != invalid_index;
          curr = m_map.m_next_index[curr], ++ii) {
-      printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr),
-             m_map.value_at(curr));
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d[%d]: %d->%d\n", list, ii,
+                                    m_map.key_at(curr), m_map.value_at(curr));
     }
   }
 };
diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt
index c84c5f6d5ec30ce6c9267dbd6c4719926fe81287..947d222c273dc4d87823ad3560a1af6c62a1e52b 100644
--- a/packages/kokkos/containers/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt
@@ -2,6 +2,7 @@
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
+KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
 foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
   # Because there is always an exception to the rule
@@ -41,11 +42,6 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
       configure_file(${dir}/dummy.cpp ${file})
       list(APPEND UnitTestSources ${file})
     endforeach()
-    list(REMOVE_ITEM UnitTestSources
-        ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Bitset.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_ScatterView.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_UnorderedMap.cpp
-        )
     KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
   endif()
 endforeach()
diff --git a/packages/kokkos/containers/unit_tests/Makefile b/packages/kokkos/containers/unit_tests/Makefile
index f42b9b75190790ef693dc1b065781a32d61207e7..82669fe1ab7532b69556cafbb7131b595f9e5f8e 100644
--- a/packages/kokkos/containers/unit_tests/Makefile
+++ b/packages/kokkos/containers/unit_tests/Makefile
@@ -26,7 +26,7 @@ override LDFLAGS += -lpthread
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
-KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files
 
 TEST_TARGETS =
 TARGETS =
diff --git a/packages/kokkos/containers/unit_tests/TestCuda_Category.hpp b/packages/kokkos/containers/unit_tests/TestCuda_Category.hpp
deleted file mode 100644
index 50935d7a34d1d2fe69311f33c71aaefb19f45080..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/unit_tests/TestCuda_Category.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_CUDA_HPP
-#define KOKKOS_TEST_CUDA_HPP
-
-#define TEST_CATEGORY cuda
-#define TEST_EXECSPACE Kokkos::Cuda
-
-#endif
diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp
index 531caf0f85ce286cefae2f603c0f947dadf81594..3eee85ed10bd81bc8b511afa9f0fbde7ba244b8f 100644
--- a/packages/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp
@@ -114,6 +114,8 @@ struct test_dualview_combinations {
 
     a.template modify<typename ViewType::execution_space>();
     a.template sync<typename ViewType::host_mirror_space>();
+    a.template sync<typename ViewType::host_mirror_space>(
+        Kokkos::DefaultExecutionSpace{});
 
     a.h_view(5, 1) = 3;
     a.h_view(6, 1) = 4;
@@ -122,11 +124,15 @@ struct test_dualview_combinations {
     ViewType b = Kokkos::subview(a, std::pair<unsigned int, unsigned int>(6, 9),
                                  std::pair<unsigned int, unsigned int>(0, 1));
     a.template sync<typename ViewType::execution_space>();
+    a.template sync<typename ViewType::execution_space>(
+        Kokkos::DefaultExecutionSpace{});
     b.template modify<typename ViewType::execution_space>();
 
     Kokkos::deep_copy(b.d_view, 2);
 
     a.template sync<typename ViewType::host_mirror_space>();
+    a.template sync<typename ViewType::host_mirror_space>(
+        Kokkos::DefaultExecutionSpace{});
     Scalar count = 0;
     for (unsigned int i = 0; i < a.d_view.extent(0); i++)
       for (unsigned int j = 0; j < a.d_view.extent(1); j++)
@@ -180,6 +186,7 @@ struct test_dual_view_deep_copy {
     } else {
       a.modify_device();
       a.sync_host();
+      a.sync_host(Kokkos::DefaultExecutionSpace{});
     }
 
     // Check device view is initialized as expected
@@ -208,6 +215,7 @@ struct test_dual_view_deep_copy {
       b.template sync<typename ViewType::host_mirror_space>();
     } else {
       b.sync_host();
+      b.sync_host(Kokkos::DefaultExecutionSpace{});
     }
 
     // Perform same checks on b as done on a
@@ -302,6 +310,7 @@ struct test_dualview_resize {
     ASSERT_EQ(a.extent(1), m / factor);
 
     a.sync_device();
+    a.sync_device(Kokkos::DefaultExecutionSpace{});
 
     // Check device view is initialized as expected
     a_d_sum = 0;
@@ -404,19 +413,14 @@ void test_dualview_resize() {
   Impl::test_dualview_resize<Scalar, Device>();
 }
 
-// FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
 TEST(TEST_CATEGORY, dualview_combination) {
   test_dualview_combinations<int, TEST_EXECSPACE>(10, true);
 }
-#endif
 
 TEST(TEST_CATEGORY, dualview_alloc) {
   test_dualview_alloc<int, TEST_EXECSPACE>(10);
 }
 
-// FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
 TEST(TEST_CATEGORY, dualview_combinations_without_init) {
   test_dualview_combinations<int, TEST_EXECSPACE>(10, false);
 }
@@ -433,8 +437,133 @@ TEST(TEST_CATEGORY, dualview_realloc) {
 TEST(TEST_CATEGORY, dualview_resize) {
   test_dualview_resize<int, TEST_EXECSPACE>();
 }
+
+namespace {
+/**
+ *
+ * The following tests are a response to
+ * https://github.com/kokkos/kokkos/issues/3850
+ * and
+ * https://github.com/kokkos/kokkos/pull/3857
+ *
+ * DualViews were returning incorrect view types and taking
+ * inappropriate actions based on the templated view methods.
+ *
+ * Specifically, template view methods were always returning
+ * a device view if the memory space was UVM and a Kokkos::Device was passed.
+ * Sync/modify methods completely broke down So these tests exist to make sure
+ * that we keep the semantics of UVM DualViews intact.
+ */
+// modify if we have other UVM enabled backends
+#ifdef KOKKOS_ENABLE_CUDA  // OR other UVM builds
+#define UVM_ENABLED_BUILD
+#endif
+
+#ifdef UVM_ENABLED_BUILD
+template <typename ExecSpace>
+struct UVMSpaceFor;
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA  // specific to CUDA
+template <>
+struct UVMSpaceFor<Kokkos::Cuda> {
+  using type = Kokkos::CudaUVMSpace;
+};
+#endif
+
+#ifdef UVM_ENABLED_BUILD
+template <>
+struct UVMSpaceFor<Kokkos::DefaultHostExecutionSpace> {
+  using type = typename UVMSpaceFor<Kokkos::DefaultExecutionSpace>::type;
+};
+#else
+template <typename ExecSpace>
+struct UVMSpaceFor {
+  using type = typename ExecSpace::memory_space;
+};
 #endif
 
+using ExecSpace  = Kokkos::DefaultExecutionSpace;
+using MemSpace   = typename UVMSpaceFor<Kokkos::DefaultExecutionSpace>::type;
+using DeviceType = Kokkos::Device<ExecSpace, MemSpace>;
+
+using DualViewType = Kokkos::DualView<double*, Kokkos::LayoutLeft, DeviceType>;
+using d_device     = DeviceType;
+using h_device     = Kokkos::Device<
+    Kokkos::DefaultHostExecutionSpace,
+    typename UVMSpaceFor<Kokkos::DefaultHostExecutionSpace>::type>;
+
+TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) {
+  DualViewType dv("myView", 100);
+  dv.clear_sync_state();
+  auto v_d      = dv.template view<d_device>();
+  using vdt     = decltype(v_d);
+  using vdt_d   = vdt::device_type;
+  using vdt_d_e = vdt_d::execution_space;
+  ASSERT_STREQ(vdt_d_e::name(), Kokkos::DefaultExecutionSpace::name());
+}
+TEST(TEST_CATEGORY, dualview_host_correct_kokkos_device) {
+  DualViewType dv("myView", 100);
+  dv.clear_sync_state();
+  auto v_h      = dv.template view<h_device>();
+  using vht     = decltype(v_h);
+  using vht_d   = vht::device_type;
+  using vht_d_e = vht_d::execution_space;
+  ASSERT_STREQ(vht_d_e::name(), Kokkos::DefaultHostExecutionSpace::name());
+}
+
+TEST(TEST_CATEGORY, dualview_host_modify_template_device_sync) {
+  DualViewType dv("myView", 100);
+  dv.clear_sync_state();
+  dv.modify_host();
+  dv.template sync<d_device>();
+  EXPECT_TRUE(!dv.need_sync_device());
+  EXPECT_TRUE(!dv.need_sync_host());
+  dv.clear_sync_state();
+}
+
+TEST(TEST_CATEGORY, dualview_host_modify_template_device_execspace_sync) {
+  DualViewType dv("myView", 100);
+  dv.clear_sync_state();
+  dv.modify_host();
+  dv.template sync<d_device::execution_space>();
+  EXPECT_TRUE(!dv.need_sync_device());
+  EXPECT_TRUE(!dv.need_sync_host());
+  dv.clear_sync_state();
+}
+
+TEST(TEST_CATEGORY, dualview_device_modify_template_host_sync) {
+  DualViewType dv("myView", 100);
+  dv.clear_sync_state();
+  dv.modify_device();
+  dv.template sync<h_device>();
+  EXPECT_TRUE(!dv.need_sync_device());
+  EXPECT_TRUE(!dv.need_sync_host());
+  dv.clear_sync_state();
+}
+TEST(TEST_CATEGORY, dualview_device_modify_template_host_execspace_sync) {
+  DualViewType dv("myView", 100);
+  dv.clear_sync_state();
+  dv.modify_device();
+  dv.template sync<h_device::execution_space>();
+  EXPECT_TRUE(!dv.need_sync_device());
+  EXPECT_TRUE(!dv.need_sync_host());
+  dv.clear_sync_state();
+}
+
+TEST(TEST_CATEGORY,
+     dualview_template_views_return_correct_executionspace_views) {
+  DualViewType dv("myView", 100);
+  dv.clear_sync_state();
+  using hvt = decltype(dv.view<typename Kokkos::DefaultHostExecutionSpace>());
+  using dvt = decltype(dv.view<typename Kokkos::DefaultExecutionSpace>());
+  ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(),
+               dvt::device_type::execution_space::name());
+  ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(),
+               hvt::device_type::execution_space::name());
+}
+
+}  // anonymous namespace
 }  // namespace Test
 
 #endif  // KOKKOS_TEST_DUALVIEW_HPP
diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
index 4b9f9944172452578ebe37675b274385c3ce840c..f018793dd6f3b162acbf9db20174c47ac75fc1c0 100644
--- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -243,8 +243,6 @@ struct TestDynamicView {
   }
 };
 
-// FIXME_SYCL needs resize_serial
-#ifndef KOKKOS_ENABLE_SYCL
 TEST(TEST_CATEGORY, dynamic_view) {
   using TestDynView = TestDynamicView<double, TEST_EXECSPACE>;
 
@@ -252,7 +250,6 @@ TEST(TEST_CATEGORY, dynamic_view) {
     TestDynView::run(100000 + 100 * i);
   }
 }
-#endif
 
 }  // namespace Test
 
diff --git a/packages/kokkos/containers/unit_tests/TestHPX_Category.hpp b/packages/kokkos/containers/unit_tests/TestHPX_Category.hpp
deleted file mode 100644
index 64fc7c0757baca29e2c0e02099a4234330378eb7..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/unit_tests/TestHPX_Category.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_HPX_HPP
-#define KOKKOS_TEST_HPX_HPP
-
-#define TEST_CATEGORY hpx
-#define TEST_EXECSPACE Kokkos::Experimental::HPX
-
-#endif
diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
index 802813b13b81d9f0b048aeec7b17cccae2507ce3..9ddc226e291f6e7dc7d6bc960fad470fafeb9974 100644
--- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
@@ -130,8 +130,6 @@ void test_offsetview_construction() {
     }
   }
 
-  // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
   const int ovmin0 = ov.begin(0);
   const int ovend0 = ov.end(0);
   const int ovmin1 = ov.begin(1);
@@ -178,7 +176,6 @@ void test_offsetview_construction() {
   }
 
   ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView";
-#endif
 #endif
 
   {
@@ -215,8 +212,6 @@ void test_offsetview_construction() {
                                   point3_type{{extent0, extent1, extent2}});
 
 #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-    // FIXME_SYCL requires MDRange policy
-#ifdef KOKKOS_ENABLE_SYCL
     int view3DSum = 0;
     Kokkos::parallel_reduce(
         rangePolicy3DZero,
@@ -239,7 +234,6 @@ void test_offsetview_construction() {
 
     ASSERT_EQ(view3DSum, offsetView3DSum)
         << "construction of OffsetView from View and begins array broken.";
-#endif
 #endif
   }
   view_type viewFromOV = ov.view();
@@ -266,8 +260,6 @@ void test_offsetview_construction() {
     Kokkos::deep_copy(aView, ov);
 
 #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-    // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
     int sum = 0;
     Kokkos::parallel_reduce(
         rangePolicy2D,
@@ -277,7 +269,6 @@ void test_offsetview_construction() {
         sum);
 
     ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken.";
-#endif
 #endif
   }
 
@@ -288,8 +279,6 @@ void test_offsetview_construction() {
     Kokkos::deep_copy(ov, aView);
 
 #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-    // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
     int sum = 0;
     Kokkos::parallel_reduce(
         rangePolicy2D,
@@ -299,7 +288,6 @@ void test_offsetview_construction() {
         sum);
 
     ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken.";
-#endif
 #endif
   }
 }
@@ -471,8 +459,6 @@ void test_offsetview_subview() {
       ASSERT_EQ(offsetSubview.end(1), 9);
 
 #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-      // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
       using range_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>,
                                                Kokkos::IndexType<int> >;
       using point_type = typename range_type::point_type;
@@ -498,7 +484,6 @@ void test_offsetview_subview() {
           sum);
 
       ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1));
-#endif
 #endif
     }
 
@@ -701,12 +686,9 @@ void test_offsetview_offsets_rank3() {
 }
 #endif
 
-// FIXME_SYCL needs MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
 TEST(TEST_CATEGORY, offsetview_construction) {
   test_offsetview_construction<int, TEST_EXECSPACE>();
 }
-#endif
 
 TEST(TEST_CATEGORY, offsetview_unmanaged_construction) {
   test_offsetview_unmanaged_construction<int, TEST_EXECSPACE>();
diff --git a/packages/kokkos/containers/unit_tests/TestOpenMP_Category.hpp b/packages/kokkos/containers/unit_tests/TestOpenMP_Category.hpp
deleted file mode 100644
index a0169d170294ad9e7b32d847de09875b37bce8e0..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/unit_tests/TestOpenMP_Category.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_OPENMP_HPP
-#define KOKKOS_TEST_OPENMP_HPP
-
-#define TEST_CATEGORY openmp
-#define TEST_EXECSPACE Kokkos::OpenMP
-
-#endif
diff --git a/packages/kokkos/containers/unit_tests/TestSYCL_Category.hpp b/packages/kokkos/containers/unit_tests/TestSYCL_Category.hpp
deleted file mode 100644
index 51fd3fc91118f55cf68cdac1cf2b532a3978364f..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/unit_tests/TestSYCL_Category.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_SYCL_HPP
-#define KOKKOS_TEST_SYCL_HPP
-
-#define TEST_CATEGORY sycl
-#define TEST_EXECSPACE Kokkos::Experimental::SYCL
-
-#endif
diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
index 3a3cb607a64e67908381bdb24e796c6ac40758c7..fdbce2d492009cf38d5491398d77423108edc6a5 100644
--- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
@@ -437,6 +437,10 @@ struct test_scatter_view_config {
                                           Contribution, Op,
                                           NumberType>::orig_view_type;
 
+  void compile_constructor() {
+    auto sv = scatter_view_def(Kokkos::view_alloc(DeviceType{}, "label"), 10);
+  }
+
   void run_test(int n) {
     // test allocation
     {
diff --git a/packages/kokkos/containers/unit_tests/TestSerial_Category.hpp b/packages/kokkos/containers/unit_tests/TestSerial_Category.hpp
deleted file mode 100644
index 2aa09a315ae01e70a4267e6214fe478bbd0a9592..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/unit_tests/TestSerial_Category.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_SERIAL_HPP
-#define KOKKOS_TEST_SERIAL_HPP
-
-#define TEST_CATEGORY serial
-#define TEST_EXECSPACE Kokkos::Serial
-
-#endif
diff --git a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
index 8bb267ce5d9701ea68538f0612f3bdcefcd3a0e0..a9a178f95e7b7fedabcb7b00b292d88603ff3f77 100644
--- a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
+++ b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@@ -285,10 +285,7 @@ void run_test_graph4() {
 
 TEST(TEST_CATEGORY, staticcrsgraph) {
   TestStaticCrsGraph::run_test_graph<TEST_EXECSPACE>();
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   TestStaticCrsGraph::run_test_graph2<TEST_EXECSPACE>();
-#endif
   TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(1, 0);
   TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(1, 1000);
   TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(1, 10000);
diff --git a/packages/kokkos/containers/unit_tests/TestThreads_Category.hpp b/packages/kokkos/containers/unit_tests/TestThreads_Category.hpp
deleted file mode 100644
index 74a2b0da362e3226230c0f11e3a7fc987eb9a615..0000000000000000000000000000000000000000
--- a/packages/kokkos/containers/unit_tests/TestThreads_Category.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_THREADS_HPP
-#define KOKKOS_TEST_THREADS_HPP
-
-#define TEST_CATEGORY threads
-#define TEST_EXECSPACE Kokkos::Threads
-
-#endif
diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
index d39e0061c747c78abbd30d0284cc398a41714326..4413cfbc80e31271d1e2b830976796ade24aaa9a 100644
--- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
+++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -163,7 +163,8 @@ struct TestFind {
   KOKKOS_INLINE_FUNCTION
   void operator()(typename execution_space::size_type i,
                   value_type &errors) const {
-    const bool expect_to_find_i = (i < m_max_key);
+    const bool expect_to_find_i =
+        (i < typename execution_space::size_type(m_max_key));
 
     const bool exists = m_map.exists(i);
 
@@ -293,10 +294,11 @@ void test_deep_copy(uint32_t num_nodes) {
   }
 }
 
-// FIXME_HIP wrong result in CI but works locally
-#ifndef KOKKOS_ENABLE_HIP
+// FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs
+// FIXME_HIP
 // WORKAROUND MSVC
-#ifndef _WIN32
+#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \
+    !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
 TEST(TEST_CATEGORY, UnorderedMap_insert) {
   for (int i = 0; i < 500; ++i) {
     test_insert<TEST_EXECSPACE>(100000, 90000, 100, true);
@@ -304,7 +306,6 @@ TEST(TEST_CATEGORY, UnorderedMap_insert) {
   }
 }
 #endif
-#endif
 
 TEST(TEST_CATEGORY, UnorderedMap_failed_insert) {
   for (int i = 0; i < 1000; ++i) test_failed_insert<TEST_EXECSPACE>(10000);
diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt
index b7b817c910974b615f50e5b6bdb76e3429c66d27..9ff4b6006da8cb0358f2a9e53810b79ce59e8b02 100644
--- a/packages/kokkos/core/perf_test/CMakeLists.txt
+++ b/packages/kokkos/core/perf_test/CMakeLists.txt
@@ -9,6 +9,14 @@
 # that in TriBITS KokkosAlgorithms can be disabled...
 #INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
 
+# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
+IF (KOKKOS_ENABLE_OPENMPTARGET
+    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI
+         OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+  RETURN()
+ENDIF()
+
+
 SET(SOURCES
   PerfTestMain.cpp
   PerfTestGramSchmidt.cpp
@@ -68,8 +76,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 
 # This test currently times out for MSVC
-# FIXME_SYCL these tests don't compile yet (require parallel_for).
-IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL)
+IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     PerfTestExec
     SOURCES ${SOURCES}
@@ -77,13 +84,11 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL)
   )
 ENDIF()
 
-# FIXME_SYCL
-IF(NOT Kokkos_ENABLE_SYCL)
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    PerformanceTest_Atomic
-    SOURCES test_atomic.cpp
-    CATEGORIES PERFORMANCE
-  )
+KOKKOS_ADD_EXECUTABLE_AND_TEST(
+  PerformanceTest_Atomic
+  SOURCES test_atomic.cpp
+  CATEGORIES PERFORMANCE
+)
 
 IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
@@ -98,7 +103,6 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST(
   SOURCES test_mempool.cpp
   CATEGORIES PERFORMANCE
 )
-ENDIF()
 
 IF(NOT Kokkos_ENABLE_OPENMPTARGET)
 # FIXME OPENMPTARGET needs tasking
diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
index 70186283c1a76789b1ab943b3793f36f55b9f258..dee21fd7a575bd5aa0f6838980c670510f475cab 100644
--- a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -69,7 +69,7 @@ struct InvNorm2 : public Kokkos::DotSingle<VectorView> {
 
   KOKKOS_INLINE_FUNCTION
   void final(value_type& result) const {
-    result = std::sqrt(result);
+    result = Kokkos::Experimental::sqrt(result);
     Rjj()  = result;
     inv()  = (0 < result) ? 1.0 / result : 0;
   }
@@ -145,7 +145,7 @@ struct ModifiedGramSchmidt {
       // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ;
       Kokkos::scale(tmp, Qj);
 
-      for (size_t k = j + 1; k < count; ++k) {
+      for (size_type k = j + 1; k < count; ++k) {
         const vector_type Qk = Kokkos::subview(Q_, Kokkos::ALL(), k);
         const value_view Rjk = Kokkos::subview(R_, j, k);
 
@@ -165,7 +165,7 @@ struct ModifiedGramSchmidt {
 
   //--------------------------------------------------------------------------
 
-  static double test(const size_t length, const size_t count,
+  static double test(const size_type length, const size_type count,
                      const size_t iter = 1) {
     multivector_type Q_("Q", length, count);
     multivector_type R_("R", count, count);
diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt
index e0590a78a4bce924e7b71de13f67603a275a8464..2ab0989805723ce32115d379dd39708b5edd8209 100644
--- a/packages/kokkos/core/src/CMakeLists.txt
+++ b/packages/kokkos/core/src/CMakeLists.txt
@@ -72,8 +72,6 @@ KOKKOS_ADD_LIBRARY(
   ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags
 )
 
-SET_TARGET_PROPERTIES(kokkoscore PROPERTIES VERSION ${Kokkos_VERSION})
-
 KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore
   ${KOKKOS_TOP_BUILD_DIR}
   ${CMAKE_CURRENT_BINARY_DIR}
@@ -87,3 +85,4 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD)
+KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 4a30c914f0808c675c3f7f5b3a88a1f94322b149..916f109758de4ba3cf469659d7458ae77cf464da 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -45,6 +45,10 @@
 #include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_CUDA
 
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Cuda.hpp>
+#include <Kokkos_CudaSpace.hpp>
+
 #include <cstdlib>
 #include <iostream>
 #include <sstream>
@@ -52,10 +56,6 @@
 #include <algorithm>
 #include <atomic>
 
-#include <Kokkos_Core.hpp>
-#include <Kokkos_Cuda.hpp>
-#include <Kokkos_CudaSpace.hpp>
-
 //#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>
@@ -65,6 +65,22 @@
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
+cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() {
+  static cudaStream_t s = nullptr;
+  if (s == nullptr) {
+    cudaStreamCreate(&s);
+  }
+  return s;
+}
+
+const std::unique_ptr<Kokkos::Cuda> &Kokkos::Impl::cuda_get_deep_copy_space(
+    bool initialize) {
+  static std::unique_ptr<Cuda> space = nullptr;
+  if (!space && initialize)
+    space = std::make_unique<Cuda>(Kokkos::Impl::cuda_get_deep_copy_stream());
+  return space;
+}
+
 namespace Kokkos {
 namespace Impl {
 
@@ -72,13 +88,6 @@ namespace {
 
 static std::atomic<int> num_uvm_allocations(0);
 
-cudaStream_t get_deep_copy_stream() {
-  static cudaStream_t s = nullptr;
-  if (s == nullptr) {
-    cudaStreamCreate(&s);
-  }
-  return s;
-}
 }  // namespace
 
 DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
@@ -115,7 +124,7 @@ DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
 }
 
 void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
-  cudaStream_t s = get_deep_copy_stream();
+  cudaStream_t s = cuda_get_deep_copy_stream();
   CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
   cudaStreamSynchronize(s);
 }
@@ -128,14 +137,14 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
 
 namespace Kokkos {
 
-void CudaSpace::access_error() {
+KOKKOS_DEPRECATED void CudaSpace::access_error() {
   const std::string msg(
       "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
       "non-Cuda space");
   Kokkos::Impl::throw_runtime_exception(msg);
 }
 
-void CudaSpace::access_error(const void *const) {
+KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) {
   const std::string msg(
       "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
       "non-Cuda space");
@@ -459,79 +468,6 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
   return tex_obj;
 }
 
-//==============================================================================
-// <editor-fold desc="SharedAllocationRecord::get_label()"> {{{1
-
-std::string SharedAllocationRecord<Kokkos::CudaSpace, void>::get_label() const {
-  SharedAllocationHeader header;
-
-  Kokkos::Impl::DeepCopy<Kokkos::HostSpace, Kokkos::CudaSpace>(
-      &header, RecordBase::head(), sizeof(SharedAllocationHeader));
-
-  return std::string(header.m_label);
-}
-
-std::string SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::get_label()
-    const {
-  return std::string(RecordBase::head()->m_label);
-}
-
-std::string
-SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::get_label() const {
-  return std::string(RecordBase::head()->m_label);
-}
-
-// </editor-fold> end SharedAllocationRecord::get_label() }}}1
-//==============================================================================
-
-//==============================================================================
-// <editor-fold desc="SharedAllocationRecord allocate()"> {{{1
-
-SharedAllocationRecord<Kokkos::CudaSpace, void>
-    *SharedAllocationRecord<Kokkos::CudaSpace, void>::allocate(
-        const Kokkos::CudaSpace &arg_space, const std::string &arg_label,
-        const size_t arg_alloc_size) {
-  return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size);
-}
-
-SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
-    *SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::allocate(
-        const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label,
-        const size_t arg_alloc_size) {
-  return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size);
-}
-
-SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>
-    *SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::allocate(
-        const Kokkos::CudaHostPinnedSpace &arg_space,
-        const std::string &arg_label, const size_t arg_alloc_size) {
-  return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size);
-}
-
-// </editor-fold> end SharedAllocationRecord allocate() }}}1
-//==============================================================================
-
-//==============================================================================
-// <editor-fold desc="SharedAllocationRecord deallocate"> {{{1
-
-void SharedAllocationRecord<Kokkos::CudaSpace, void>::deallocate(
-    SharedAllocationRecord<void, void> *arg_rec) {
-  delete static_cast<SharedAllocationRecord *>(arg_rec);
-}
-
-void SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::deallocate(
-    SharedAllocationRecord<void, void> *arg_rec) {
-  delete static_cast<SharedAllocationRecord *>(arg_rec);
-}
-
-void SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::deallocate(
-    SharedAllocationRecord<void, void> *arg_rec) {
-  delete static_cast<SharedAllocationRecord *>(arg_rec);
-}
-
-// </editor-fold> end SharedAllocationRecord deallocate }}}1
-//==============================================================================
-
 //==============================================================================
 // <editor-fold desc="SharedAllocationRecord destructors"> {{{1
 
@@ -580,7 +516,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
     const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record,
 #endif
@@ -592,13 +528,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
 
   SharedAllocationHeader header;
 
-  // Fill in the Header information
-  header.m_record = static_cast<SharedAllocationRecord<void, void> *>(this);
-
-  strncpy(header.m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-  // Set last element zero, in case c_str is too long
-  header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
+  this->base_t::_fill_host_accessible_header_info(header, arg_label);
 
   // Copy to device memory
   Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(RecordBase::m_alloc_ptr, &header,
@@ -611,7 +541,7 @@ SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
     const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record,
 #endif
@@ -620,16 +550,8 @@ SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
       m_tex_obj(0),
       m_space(arg_space) {
-  // Fill in the Header information, directly accessible via UVM
-
-  RecordBase::m_alloc_ptr->m_record = this;
-
-  strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-
-  // Set last element zero, in case c_str is too long
-  RecordBase::m_alloc_ptr
-      ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
 }
 
 SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
@@ -639,7 +561,7 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
         const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                                   void>::s_root_record,
@@ -648,319 +570,13 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
                                                arg_alloc_size),
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
       m_space(arg_space) {
-  // Fill in the Header information, directly accessible on the host
-
-  RecordBase::m_alloc_ptr->m_record = this;
-
-  strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-  // Set last element zero, in case c_str is too long
-  RecordBase::m_alloc_ptr
-      ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
 }
 
 // </editor-fold> end SharedAllocationRecord constructors }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="SharedAllocationRecored::(re|de|)allocate_tracked"> {{{1
-
-void *SharedAllocationRecord<Kokkos::CudaSpace, void>::allocate_tracked(
-    const Kokkos::CudaSpace &arg_space, const std::string &arg_alloc_label,
-    const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord *const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
-}
-
-void SharedAllocationRecord<Kokkos::CudaSpace, void>::deallocate_tracked(
-    void *const arg_alloc_ptr) {
-  if (arg_alloc_ptr != nullptr) {
-    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
-  }
-}
-
-void *SharedAllocationRecord<Kokkos::CudaSpace, void>::reallocate_tracked(
-    void *const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord *const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  Kokkos::Impl::DeepCopy<CudaSpace, CudaSpace>(
-      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
-}
-
-void *SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::allocate_tracked(
-    const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_alloc_label,
-    const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord *const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
-}
-
-void SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::deallocate_tracked(
-    void *const arg_alloc_ptr) {
-  if (arg_alloc_ptr != nullptr) {
-    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
-  }
-}
-
-void *SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::reallocate_tracked(
-    void *const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord *const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  Kokkos::Impl::DeepCopy<CudaUVMSpace, CudaUVMSpace>(
-      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
-}
-
-void *
-SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::allocate_tracked(
-    const Kokkos::CudaHostPinnedSpace &arg_space,
-    const std::string &arg_alloc_label, const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord *const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
-}
-
-void SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
-                            void>::deallocate_tracked(void *const
-                                                          arg_alloc_ptr) {
-  if (arg_alloc_ptr != nullptr) {
-    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
-  }
-}
-
-void *
-SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::reallocate_tracked(
-    void *const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord *const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  Kokkos::Impl::DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace>(
-      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
-}
-
-// </editor-fold> end SharedAllocationRecored::(re|de|)allocate_tracked }}}1
-//==============================================================================
-
-//==============================================================================
-// <editor-fold desc="SharedAllocationRecord::get_record()"> {{{1
-
-SharedAllocationRecord<Kokkos::CudaSpace, void> *
-SharedAllocationRecord<Kokkos::CudaSpace, void>::get_record(void *alloc_ptr) {
-  using RecordCuda = SharedAllocationRecord<Kokkos::CudaSpace, void>;
-
-  using Header = SharedAllocationHeader;
-
-  // Copy the header from the allocation
-  Header head;
-
-  Header const *const head_cuda =
-      alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;
-
-  if (alloc_ptr) {
-    Kokkos::Impl::DeepCopy<HostSpace, CudaSpace>(
-        &head, head_cuda, sizeof(SharedAllocationHeader));
-  }
-
-  RecordCuda *const record =
-      alloc_ptr ? static_cast<RecordCuda *>(head.m_record) : nullptr;
-
-  if (!alloc_ptr || record->m_alloc_ptr != head_cuda) {
-    Kokkos::Impl::throw_runtime_exception(
-        std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , "
-                    "void >::get_record ERROR"));
-  }
-
-  return record;
-}
-
-SharedAllocationRecord<Kokkos::CudaUVMSpace, void> *SharedAllocationRecord<
-    Kokkos::CudaUVMSpace, void>::get_record(void *alloc_ptr) {
-  using Header     = SharedAllocationHeader;
-  using RecordCuda = SharedAllocationRecord<Kokkos::CudaUVMSpace, void>;
-
-  Header *const h =
-      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : nullptr;
-
-  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
-    Kokkos::Impl::throw_runtime_exception(
-        std::string("Kokkos::Impl::SharedAllocationRecord< "
-                    "Kokkos::CudaUVMSpace , void >::get_record ERROR"));
-  }
-
-  return static_cast<RecordCuda *>(h->m_record);
-}
-
-SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>
-    *SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::get_record(
-        void *alloc_ptr) {
-  using Header     = SharedAllocationHeader;
-  using RecordCuda = SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;
-
-  Header *const h =
-      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : nullptr;
-
-  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
-    Kokkos::Impl::throw_runtime_exception(
-        std::string("Kokkos::Impl::SharedAllocationRecord< "
-                    "Kokkos::CudaHostPinnedSpace , void >::get_record ERROR"));
-  }
-
-  return static_cast<RecordCuda *>(h->m_record);
-}
-
-// </editor-fold> end SharedAllocationRecord::get_record() }}}1
-//==============================================================================
-
-//==============================================================================
-// <editor-fold desc="SharedAllocationRecord::print_records()"> {{{1
-
-// Iterate records to print orphaned memory ...
-void SharedAllocationRecord<Kokkos::CudaSpace, void>::print_records(
-    std::ostream &s, const Kokkos::CudaSpace &, bool detail) {
-  (void)s;
-  (void)detail;
-#ifdef KOKKOS_ENABLE_DEBUG
-  SharedAllocationRecord<void, void> *r = &s_root_record;
-
-  char buffer[256];
-
-  SharedAllocationHeader head;
-
-  if (detail) {
-    do {
-      if (r->m_alloc_ptr) {
-        Kokkos::Impl::DeepCopy<HostSpace, CudaSpace>(
-            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
-      } else {
-        head.m_label[0] = 0;
-      }
-
-      // Formatting dependent on sizeof(uintptr_t)
-      const char *format_string;
-
-      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-        format_string =
-            "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx "
-            "+ %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
-      } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-        format_string =
-            "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ "
-            "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
-      }
-
-      snprintf(buffer, 256, format_string, reinterpret_cast<uintptr_t>(r),
-               reinterpret_cast<uintptr_t>(r->m_prev),
-               reinterpret_cast<uintptr_t>(r->m_next),
-               reinterpret_cast<uintptr_t>(r->m_alloc_ptr), r->m_alloc_size,
-               r->m_count, reinterpret_cast<uintptr_t>(r->m_dealloc),
-               head.m_label);
-      s << buffer;
-      r = r->m_next;
-    } while (r != &s_root_record);
-  } else {
-    do {
-      if (r->m_alloc_ptr) {
-        Kokkos::Impl::DeepCopy<HostSpace, CudaSpace>(
-            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
-
-        // Formatting dependent on sizeof(uintptr_t)
-        const char *format_string;
-
-        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-          format_string = "Cuda [ 0x%.12lx + %ld ] %s\n";
-        } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-          format_string = "Cuda [ 0x%.12llx + %ld ] %s\n";
-        }
-
-        snprintf(buffer, 256, format_string,
-                 reinterpret_cast<uintptr_t>(r->data()), r->size(),
-                 head.m_label);
-      } else {
-        snprintf(buffer, 256, "Cuda [ 0 + 0 ]\n");
-      }
-      s << buffer;
-      r = r->m_next;
-    } while (r != &s_root_record);
-  }
-#else
-  Kokkos::Impl::throw_runtime_exception(
-      "SharedAllocationHeader<CudaSpace>::print_records only works with "
-      "KOKKOS_ENABLE_DEBUG enabled");
-#endif
-}
-
-void SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::print_records(
-    std::ostream &s, const Kokkos::CudaUVMSpace &, bool detail) {
-  (void)s;
-  (void)detail;
-#ifdef KOKKOS_ENABLE_DEBUG
-  SharedAllocationRecord<void, void>::print_host_accessible_records(
-      s, "CudaUVM", &s_root_record, detail);
-#else
-  Kokkos::Impl::throw_runtime_exception(
-      "SharedAllocationHeader<CudaSpace>::print_records only works with "
-      "KOKKOS_ENABLE_DEBUG enabled");
-#endif
-}
-
-void SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::print_records(
-    std::ostream &s, const Kokkos::CudaHostPinnedSpace &, bool detail) {
-  (void)s;
-  (void)detail;
-#ifdef KOKKOS_ENABLE_DEBUG
-  SharedAllocationRecord<void, void>::print_host_accessible_records(
-      s, "CudaHostPinned", &s_root_record, detail);
-#else
-  Kokkos::Impl::throw_runtime_exception(
-      "SharedAllocationHeader<CudaSpace>::print_records only works with "
-      "KOKKOS_ENABLE_DEBUG enabled");
-#endif
-}
-
-// </editor-fold> end SharedAllocationRecord::print_records() }}}1
-//==============================================================================
-
 void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
                            bool to_device) {
   if ((ptr == nullptr) || (bytes == 0)) return;
@@ -984,6 +600,29 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
 
 }  // namespace Impl
 }  // namespace Kokkos
+
+//==============================================================================
+// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
+
+#include <impl/Kokkos_SharedAlloc_timpl.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+// To avoid additional compilation cost for something that's (mostly?) not
+// performance sensitive, we explicity instantiate these CRTP base classes here,
+// where we have access to the associated *_timpl.hpp header files.
+template class SharedAllocationRecordCommon<Kokkos::CudaSpace>;
+template class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>;
+template class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>;
+template class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>;
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
+//==============================================================================
+
 #else
 void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
 #endif  // KOKKOS_ENABLE_CUDA
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 0d6d3bdb3ac5389e894f01ad4edff845b63b7b53..0f4259072d97f26c0032e674bdf60b9031fcee11 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -140,7 +140,7 @@ inline int cuda_deduce_block_size(bool early_termination,
       }
     }
 
-    if (early_termination && blocks_per_sm != 0) break;
+    if (early_termination && opt_block_size != 0) break;
   }
 
   return opt_block_size;
@@ -222,7 +222,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
       case 52:
       case 61: return 96;
       case 70:
-      case 80: return 8;
+      case 80:
+      case 86: return 8;
       case 75: return 32;
       default:
         Kokkos::Impl::throw_runtime_exception(
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
index a9a62380e5a4c26289bd96a08c3814ade0832cf1..ec9c434fe663900a5d5029896a5c98ce13266605 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
@@ -175,30 +175,42 @@ class half_t {
     return cast_from_half<unsigned long long>(*this);
   }
 
+  /**
+   * Conversion constructors.
+   *
+   * Support implicit conversions from impl_type, float, double -> half_t
+   * Mixed precision expressions require upcasting which is done in the
+   * "// Binary Arithmetic" operator overloads below.
+   *
+   * Support implicit conversions from integral types -> half_t.
+   * Expressions involving half_t with integral types require downcasting
+   * the integral types to half_t. Existing operator overloads can handle this
+   * with the addition of the below implicit conversion constructors.
+   */
   KOKKOS_FUNCTION
   half_t(impl_type rhs) : val(rhs) {}
   KOKKOS_FUNCTION
-  explicit half_t(float rhs) : val(cast_to_half(rhs).val) {}
+  half_t(float rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {}
+  half_t(double rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(double rhs) : val(cast_to_half(rhs).val) {}
+  explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(short rhs) : val(cast_to_half(rhs).val) {}
+  half_t(short rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(int rhs) : val(cast_to_half(rhs).val) {}
+  half_t(int rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(long rhs) : val(cast_to_half(rhs).val) {}
+  half_t(long rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(long long rhs) : val(cast_to_half(rhs).val) {}
+  half_t(long long rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {}
+  half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {}
+  half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {}
+  half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {}
   KOKKOS_FUNCTION
-  explicit half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {}
+  half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {}
 
   // Unary operators
   KOKKOS_FUNCTION
@@ -243,7 +255,7 @@ class half_t {
 #else
     float tmp = __half2float(val);
     --tmp;
-    val     = __float2half(tmp);
+    val = __float2half(tmp);
 #endif
     return *this;
   }
@@ -276,88 +288,317 @@ class half_t {
     return *this;
   }
 
+  template <class T>
+  KOKKOS_FUNCTION void operator=(T rhs) volatile {
+    val = cast_to_half(rhs).val;
+  }
+
   // Compound operators
   KOKKOS_FUNCTION
   half_t& operator+=(half_t rhs) {
 #ifdef __CUDA_ARCH__
     val += rhs.val;
 #else
-    val     = __float2half(__half2float(val) + __half2float(rhs.val));
+    val = __float2half(__half2float(val) + __half2float(rhs.val));
 #endif
     return *this;
   }
 
+  KOKKOS_FUNCTION
+  volatile half_t& operator+=(half_t rhs) volatile {
+#ifdef __CUDA_ARCH__
+    // Cuda 10 supports __half volatile stores but not volatile arithmetic
+    // operands. Cast away volatile-ness of val for arithmetic but not for store
+    // location.
+    val = const_cast<impl_type&>(val) + rhs.val;
+#else
+    // Use non-volatile val_ref to suppress:
+    // "warning: implicit dereference will not access object of type โ€˜volatile
+    // __halfโ€™ in statement"
+    auto val_ref = const_cast<impl_type&>(val);
+    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) +
+                           __half2float(rhs.val));
+#endif
+    return *this;
+  }
+
+  // Compund operators: upcast overloads for +=
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator+=(T& lhs, half_t rhs) {
+    lhs += static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator+=(float rhs) {
+    float result = static_cast<float>(val) + rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator+=(double rhs) {
+    double result = static_cast<double>(val) + rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
   KOKKOS_FUNCTION
   half_t& operator-=(half_t rhs) {
 #ifdef __CUDA_ARCH__
     val -= rhs.val;
 #else
-    val     = __float2half(__half2float(val) - __half2float(rhs.val));
+    val          = __float2half(__half2float(val) - __half2float(rhs.val));
 #endif
     return *this;
   }
 
+  KOKKOS_FUNCTION
+  volatile half_t& operator-=(half_t rhs) volatile {
+#ifdef __CUDA_ARCH__
+    // Cuda 10 supports __half volatile stores but not volatile arithmetic
+    // operands. Cast away volatile-ness of val for arithmetic but not for store
+    // location.
+    val = const_cast<impl_type&>(val) - rhs.val;
+#else
+    // Use non-volatile val_ref to suppress:
+    // "warning: implicit dereference will not access object of type โ€˜volatile
+    // __halfโ€™ in statement"
+    auto val_ref = const_cast<impl_type&>(val);
+    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) -
+                           __half2float(rhs.val));
+#endif
+    return *this;
+  }
+
+  // Compund operators: upcast overloads for -=
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator-=(T& lhs, half_t rhs) {
+    lhs -= static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator-=(float rhs) {
+    float result = static_cast<float>(val) - rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator-=(double rhs) {
+    double result = static_cast<double>(val) - rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
   KOKKOS_FUNCTION
   half_t& operator*=(half_t rhs) {
 #ifdef __CUDA_ARCH__
     val *= rhs.val;
 #else
-    val     = __float2half(__half2float(val) * __half2float(rhs.val));
+    val          = __float2half(__half2float(val) * __half2float(rhs.val));
 #endif
     return *this;
   }
 
+  KOKKOS_FUNCTION
+  volatile half_t& operator*=(half_t rhs) volatile {
+#ifdef __CUDA_ARCH__
+    // Cuda 10 supports __half volatile stores but not volatile arithmetic
+    // operands. Cast away volatile-ness of val for arithmetic but not for store
+    // location.
+    val = const_cast<impl_type&>(val) * rhs.val;
+#else
+    // Use non-volatile val_ref to suppress:
+    // "warning: implicit dereference will not access object of type โ€˜volatile
+    // __halfโ€™ in statement"
+    auto val_ref = const_cast<impl_type&>(val);
+    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) *
+                           __half2float(rhs.val));
+#endif
+    return *this;
+  }
+
+  // Compund operators: upcast overloads for *=
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator*=(T& lhs, half_t rhs) {
+    lhs *= static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator*=(float rhs) {
+    float result = static_cast<float>(val) * rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator*=(double rhs) {
+    double result = static_cast<double>(val) * rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
   KOKKOS_FUNCTION
   half_t& operator/=(half_t rhs) {
 #ifdef __CUDA_ARCH__
     val /= rhs.val;
 #else
-    val     = __float2half(__half2float(val) / __half2float(rhs.val));
+    val          = __float2half(__half2float(val) / __half2float(rhs.val));
 #endif
     return *this;
   }
 
+  KOKKOS_FUNCTION
+  volatile half_t& operator/=(half_t rhs) volatile {
+#ifdef __CUDA_ARCH__
+    // Cuda 10 supports __half volatile stores but not volatile arithmetic
+    // operands. Cast away volatile-ness of val for arithmetic but not for store
+    // location.
+    val = const_cast<impl_type&>(val) / rhs.val;
+#else
+    // Use non-volatile val_ref to suppress:
+    // "warning: implicit dereference will not access object of type โ€˜volatile
+    // __halfโ€™ in statement"
+    auto val_ref = const_cast<impl_type&>(val);
+    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) /
+                           __half2float(rhs.val));
+#endif
+    return *this;
+  }
+
+  // Compund operators: upcast overloads for /=
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator/=(T& lhs, half_t rhs) {
+    lhs /= static_cast<T>(rhs);
+    return lhs;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator/=(float rhs) {
+    float result = static_cast<float>(val) / rhs;
+    val          = static_cast<impl_type>(result);
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  half_t& operator/=(double rhs) {
+    double result = static_cast<double>(val) / rhs;
+    val           = static_cast<impl_type>(result);
+    return *this;
+  }
+
   // Binary Arithmetic
   KOKKOS_FUNCTION
   half_t friend operator+(half_t lhs, half_t rhs) {
 #ifdef __CUDA_ARCH__
     lhs.val += rhs.val;
 #else
-    lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
+    lhs.val      = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
 #endif
     return lhs;
   }
 
+  // Binary Arithmetic upcast operators for +
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator+(half_t lhs, T rhs) {
+    return T(lhs) + rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator+(T lhs, half_t rhs) {
+    return lhs + T(rhs);
+  }
+
   KOKKOS_FUNCTION
   half_t friend operator-(half_t lhs, half_t rhs) {
 #ifdef __CUDA_ARCH__
     lhs.val -= rhs.val;
 #else
-    lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
+    lhs.val      = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
 #endif
     return lhs;
   }
 
+  // Binary Arithmetic upcast operators for -
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator-(half_t lhs, T rhs) {
+    return T(lhs) - rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator-(T lhs, half_t rhs) {
+    return lhs - T(rhs);
+  }
+
   KOKKOS_FUNCTION
   half_t friend operator*(half_t lhs, half_t rhs) {
 #ifdef __CUDA_ARCH__
     lhs.val *= rhs.val;
 #else
-    lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
+    lhs.val      = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
 #endif
     return lhs;
   }
 
+  // Binary Arithmetic upcast operators for *
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator*(half_t lhs, T rhs) {
+    return T(lhs) * rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator*(T lhs, half_t rhs) {
+    return lhs * T(rhs);
+  }
+
   KOKKOS_FUNCTION
   half_t friend operator/(half_t lhs, half_t rhs) {
 #ifdef __CUDA_ARCH__
     lhs.val /= rhs.val;
 #else
-    lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
+    lhs.val      = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
 #endif
     return lhs;
   }
 
+  // Binary Arithmetic upcast operators for /
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator/(half_t lhs, T rhs) {
+    return T(lhs) / rhs;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION std::enable_if_t<
+      std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
+  operator/(T lhs, half_t rhs) {
+    return lhs / T(rhs);
+  }
+
   // Logical operators
   KOKKOS_FUNCTION
   bool operator!() const {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index b8e816345873ac756378ae13fee4db1fdf2dcaa6..016cb6cdcbdd37740613724bb99efb9b4c32d7d4 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -54,6 +54,7 @@
 #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
 #include <Cuda/Kokkos_Cuda_Instance.hpp>
 #include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Tools.hpp>
 
@@ -248,11 +249,11 @@ void CudaInternal::print_configuration(std::ostream &s) const {
   const CudaInternalDevices &dev_info = CudaInternalDevices::singleton();
 
 #if defined(KOKKOS_ENABLE_CUDA)
-  s << "macro  KOKKOS_ENABLE_CUDA      : defined" << std::endl;
+  s << "macro  KOKKOS_ENABLE_CUDA      : defined\n";
 #endif
 #if defined(CUDA_VERSION)
   s << "macro  CUDA_VERSION          = " << CUDA_VERSION << " = version "
-    << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << std::endl;
+    << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n';
 #endif
 
   for (int i = 0; i < dev_info.m_cudaDevCount; ++i) {
@@ -274,7 +275,6 @@ CudaInternal::~CudaInternal() {
       m_scratchConcurrentBitset) {
     std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
               << std::endl;
-    std::cerr.flush();
   }
 
   m_cudaDev                   = -1;
@@ -358,8 +358,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
 
     if (m_cudaArch == 0) {
       std::stringstream ss;
-      ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture"
-         << std::endl;
+      ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n";
       std::string msg = ss.str();
       Kokkos::abort(msg.c_str());
     }
@@ -373,7 +372,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
             "compute capability "
          << compiled_major << "." << compiled_minor
          << " on device with compute capability " << cudaProp.major << "."
-         << cudaProp.minor << " is not supported by CUDA!" << std::endl;
+         << cudaProp.minor << " is not supported by CUDA!\n";
       std::string msg = ss.str();
       Kokkos::abort(msg.c_str());
     }
@@ -458,7 +457,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
           Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
 
       Record *const r =
-          Record::allocate(Kokkos::CudaSpace(), "InternalScratchBitset",
+          Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchBitset",
                            sizeof(uint32_t) * buffer_bound);
 
       Record::increment(r);
@@ -492,17 +491,11 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
 
 #ifdef KOKKOS_ENABLE_CUDA_UVM
   if (Kokkos::show_warnings() && !cuda_launch_blocking()) {
-    std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into "
-                 "UVMSpace by default"
-              << std::endl;
-    std::cerr << "                                  without setting "
-                 "CUDA_LAUNCH_BLOCKING=1."
-              << std::endl;
-    std::cerr << "                                  The code must call "
-                 "Cuda().fence() after each kernel"
-              << std::endl;
-    std::cerr << "                                  or will likely crash when "
-                 "accessing data on the host."
+    std::cerr << R"warning(
+Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
+                                  without setting CUDA_LAUNCH_BLOCKING=1.
+                                  The code must call Cuda().fence() after each kernel
+                                  or will likely crash when accessing data on the host.)warning"
               << std::endl;
   }
 
@@ -520,19 +513,13 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
 
   if (Kokkos::show_warnings() &&
       (!visible_devices_one && !force_device_alloc)) {
-    std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into "
-                 "UVMSpace by default"
+    std::cerr << R"warning(
+Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
+                                  without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or
+                                  setting CUDA_VISIBLE_DEVICES.
+                                  This could on multi GPU systems lead to severe performance"
+                                  penalties.)warning"
               << std::endl;
-    std::cerr << "                                  without setting "
-                 "CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or "
-              << std::endl;
-    std::cerr
-        << "                                  setting CUDA_VISIBLE_DEVICES."
-        << std::endl;
-    std::cerr << "                                  This could on multi GPU "
-                 "systems lead to severe performance"
-              << std::endl;
-    std::cerr << "                                  penalties." << std::endl;
   }
 #endif
 
@@ -575,7 +562,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
     if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
 
     Record *const r =
-        Record::allocate(Kokkos::CudaSpace(), "InternalScratchFlags",
+        Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags",
                          (sizeof(ScratchGrain) * m_scratchFlagsCount));
 
     Record::increment(r);
@@ -600,7 +587,7 @@ Cuda::size_type *CudaInternal::scratch_space(const Cuda::size_type size) const {
     if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
 
     Record *const r =
-        Record::allocate(Kokkos::CudaSpace(), "InternalScratchSpace",
+        Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace",
                          (sizeof(ScratchGrain) * m_scratchSpaceCount));
 
     Record::increment(r);
@@ -624,7 +611,7 @@ Cuda::size_type *CudaInternal::scratch_unified(
       Record::decrement(Record::get_record(m_scratchUnified));
 
     Record *const r = Record::allocate(
-        Kokkos::CudaHostPinnedSpace(), "InternalScratchUnified",
+        Kokkos::CudaHostPinnedSpace(), "Kokkos::InternalScratchUnified",
         (sizeof(ScratchGrain) * m_scratchUnifiedCount));
 
     Record::increment(r);
@@ -646,8 +633,9 @@ Cuda::size_type *CudaInternal::scratch_functor(
     if (m_scratchFunctor)
       Record::decrement(Record::get_record(m_scratchFunctor));
 
-    Record *const r = Record::allocate(
-        Kokkos::CudaSpace(), "InternalScratchFunctor", m_scratchFunctorSize);
+    Record *const r =
+        Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor",
+                         m_scratchFunctorSize);
 
     Record::increment(r);
 
@@ -662,7 +650,7 @@ void *CudaInternal::resize_team_scratch_space(std::int64_t bytes,
   if (m_team_scratch_current_size == 0) {
     m_team_scratch_current_size = bytes;
     m_team_scratch_ptr          = Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
-        "CudaSpace::ScratchMemory", m_team_scratch_current_size);
+        "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size);
   }
   if ((bytes > m_team_scratch_current_size) ||
       ((bytes < m_team_scratch_current_size) && (force_shrink))) {
@@ -676,6 +664,9 @@ void *CudaInternal::resize_team_scratch_space(std::int64_t bytes,
 //----------------------------------------------------------------------------
 
 void CudaInternal::finalize() {
+  // skip if finalize() has already been called
+  if (was_finalized) return;
+
   was_finalized = true;
   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
     // Only finalize this if we're the singleton
@@ -719,6 +710,11 @@ void CudaInternal::finalize() {
   if (this == &singleton()) {
     cudaFreeHost(constantMemHostStaging);
     cudaEventDestroy(constantMemReusable);
+    auto &deep_copy_space =
+        Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
+    if (deep_copy_space)
+      deep_copy_space->impl_internal_space_instance()->finalize();
+    cudaStreamDestroy(cuda_get_deep_copy_stream());
   }
 }
 
@@ -821,62 +817,23 @@ Cuda::size_type Cuda::device_arch() {
 void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); }
 
 Cuda::Cuda()
-    : m_space_instance(&Impl::CudaInternal::singleton()), m_counter(nullptr) {
+    : m_space_instance(&Impl::CudaInternal::singleton(),
+                       [](Impl::CudaInternal *) {}) {
   Impl::CudaInternal::singleton().verify_is_initialized(
       "Cuda instance constructor");
 }
 
 Cuda::Cuda(cudaStream_t stream)
-    : m_space_instance(new Impl::CudaInternal), m_counter(new int(1)) {
+    : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) {
+        ptr->finalize();
+        delete ptr;
+      }) {
   Impl::CudaInternal::singleton().verify_is_initialized(
       "Cuda instance constructor");
   m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,
                                stream);
 }
 
-KOKKOS_FUNCTION Cuda::Cuda(Cuda &&other) noexcept {
-  m_space_instance       = other.m_space_instance;
-  other.m_space_instance = nullptr;
-  m_counter              = other.m_counter;
-  other.m_counter        = nullptr;
-}
-
-KOKKOS_FUNCTION Cuda::Cuda(const Cuda &other)
-    : m_space_instance(other.m_space_instance), m_counter(other.m_counter) {
-#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
-  if (m_counter) Kokkos::atomic_add(m_counter, 1);
-#endif
-}
-
-KOKKOS_FUNCTION Cuda &Cuda::operator=(Cuda &&other) noexcept {
-  m_space_instance       = other.m_space_instance;
-  other.m_space_instance = nullptr;
-  m_counter              = other.m_counter;
-  other.m_counter        = nullptr;
-  return *this;
-}
-
-KOKKOS_FUNCTION Cuda &Cuda::operator=(const Cuda &other) {
-  m_space_instance = other.m_space_instance;
-  m_counter        = other.m_counter;
-#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
-  if (m_counter) Kokkos::atomic_add(m_counter, 1);
-#endif
-  return *this;
-}
-
-KOKKOS_FUNCTION Cuda::~Cuda() noexcept {
-#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
-  if (m_counter == nullptr) return;
-  int const count = Kokkos::atomic_fetch_sub(m_counter, 1);
-  if (count == 1) {
-    delete m_counter;
-    m_space_instance->finalize();
-    delete m_space_instance;
-  }
-#endif
-}
-
 void Cuda::print_configuration(std::ostream &s, const bool) {
   Impl::CudaInternal::singleton().print_configuration(s);
 }
@@ -924,54 +881,53 @@ void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); }
 
 void CudaSpaceInitializer::print_configuration(std::ostream &msg,
                                                const bool detail) {
-  msg << "Device Execution Space:" << std::endl;
-  msg << "  KOKKOS_ENABLE_CUDA: ";
-  msg << "yes" << std::endl;
+  msg << "Device Execution Space:\n";
+  msg << "  KOKKOS_ENABLE_CUDA: yes\n";
 
-  msg << "Cuda Atomics:" << std::endl;
+  msg << "Cuda Atomics:\n";
   msg << "  KOKKOS_ENABLE_CUDA_ATOMICS: ";
 #ifdef KOKKOS_ENABLE_CUDA_ATOMICS
-  msg << "yes" << std::endl;
+  msg << "yes\n";
 #else
-  msg << "no" << std::endl;
+  msg << "no\n";
 #endif
 
-  msg << "Cuda Options:" << std::endl;
+  msg << "Cuda Options:\n";
   msg << "  KOKKOS_ENABLE_CUDA_LAMBDA: ";
 #ifdef KOKKOS_ENABLE_CUDA_LAMBDA
-  msg << "yes" << std::endl;
+  msg << "yes\n";
 #else
-  msg << "no" << std::endl;
+  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
 #ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
-  msg << "yes" << std::endl;
+  msg << "yes\n";
 #else
-  msg << "no" << std::endl;
+  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  msg << "yes" << std::endl;
+  msg << "yes\n";
 #else
-  msg << "no" << std::endl;
+  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CUDA_UVM: ";
 #ifdef KOKKOS_ENABLE_CUDA_UVM
-  msg << "yes" << std::endl;
+  msg << "yes\n";
 #else
-  msg << "no" << std::endl;
+  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CUSPARSE: ";
 #ifdef KOKKOS_ENABLE_CUSPARSE
-  msg << "yes" << std::endl;
+  msg << "yes\n";
 #else
-  msg << "no" << std::endl;
+  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-  msg << "yes" << std::endl;
+  msg << "yes\n";
 #else
-  msg << "no" << std::endl;
+  msg << "no\n";
 #endif
 
   msg << "\nCuda Runtime Configuration:" << std::endl;
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index 13773d70c5a8d402d65833a0dbf198405975580f..aaec2c29260a5ad2b82e2daa653a58372253cd4d 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -17,30 +17,24 @@ namespace Kokkos {
 namespace Impl {
 
 struct CudaTraits {
-  enum : CudaSpace::size_type { WarpSize = 32 /* 0x0020 */ };
-  enum : CudaSpace::size_type {
-    WarpIndexMask = 0x001f /* Mask for warpindex */
-  };
-  enum : CudaSpace::size_type {
-    WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */
-  };
-
-  enum : CudaSpace::size_type {
-    ConstantMemoryUsage = 0x008000 /* 32k bytes */
-  };
-  enum : CudaSpace::size_type {
-    ConstantMemoryCache = 0x002000 /*  8k bytes */
-  };
-  enum : CudaSpace::size_type {
-    KernelArgumentLimit = 0x001000 /*  4k bytes */
-  };
-  enum : CudaSpace::size_type {
-    MaxHierarchicalParallelism = 1024 /* team_size * vector_length */
-  };
+  static constexpr CudaSpace::size_type WarpSize = 32 /* 0x0020 */;
+  static constexpr CudaSpace::size_type WarpIndexMask =
+      0x001f; /* Mask for warpindex */
+  static constexpr CudaSpace::size_type WarpIndexShift =
+      5; /* WarpSize == 1 << WarpShift */
+
+  static constexpr CudaSpace::size_type ConstantMemoryUsage =
+      0x008000; /* 32k bytes */
+  static constexpr CudaSpace::size_type ConstantMemoryCache =
+      0x002000; /*  8k bytes */
+  static constexpr CudaSpace::size_type KernelArgumentLimit =
+      0x001000; /*  4k bytes */
+  static constexpr CudaSpace::size_type MaxHierarchicalParallelism =
+      1024; /* team_size * vector_length */
   using ConstantGlobalBufferType =
       unsigned long[ConstantMemoryUsage / sizeof(unsigned long)];
 
-  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
+  static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */;
 
   KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count(
       CudaSpace::size_type i) {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 39404e0bf38f3867136edd93e9ed9d2e11ef0477..d892a893b330772ec5e4306ed20a44f8aa2369f1 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -158,6 +158,9 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
   }
 }
 
+// This function needs to be template on DriverType and LaunchBounds
+// so that the static bool is unique for each type combo
+// KernelFuncPtr does not necessarily contain that type information.
 template <class DriverType, class LaunchBounds, class KernelFuncPtr>
 inline void configure_shmem_preference(KernelFuncPtr const& func,
                                        bool prefer_shmem) {
@@ -355,8 +358,7 @@ struct CudaParallelLaunchKernelInvoker<
 
     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      Impl::configure_shmem_preference<DriverType, LaunchBounds,
-                                       decltype(base_t::get_kernel_func())>(
+      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
           base_t::get_kernel_func(), prefer_shmem);
 
       void const* args[] = {&driver};
@@ -449,8 +451,7 @@ struct CudaParallelLaunchKernelInvoker<
 
     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      Impl::configure_shmem_preference<DriverType, LaunchBounds,
-                                       decltype(base_t::get_kernel_func())>(
+      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
           base_t::get_kernel_func(), prefer_shmem);
 
       auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -627,9 +628,8 @@ struct CudaParallelLaunchImpl<
           get_cuda_func_attributes(), block, shmem, prefer_shmem);
 
       Impl::configure_shmem_preference<
-          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-          decltype(base_t::get_kernel_func())>(base_t::get_kernel_func(),
-                                               prefer_shmem);
+          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+          base_t::get_kernel_func(), prefer_shmem);
 
       KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12b7f70a97495fca628580dda12b115cb5c25a12
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp
@@ -0,0 +1,37 @@
+#ifndef KOKKOS_CUDA_MDRANGEPOLICY_HPP_
+#define KOKKOS_CUDA_MDRANGEPOLICY_HPP_
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+
+template <>
+struct default_outer_direction<Kokkos::Cuda> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+template <>
+struct default_inner_direction<Kokkos::Cuda> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+namespace Impl {
+
+// Settings for MDRangePolicy
+template <>
+inline TileSizeProperties get_tile_size_properties<Kokkos::Cuda>(
+    const Kokkos::Cuda& space) {
+  TileSizeProperties properties;
+  properties.max_threads =
+      space.impl_internal_space_instance()->m_maxThreadsPerSM;
+  properties.default_largest_tile_size = 16;
+  properties.default_tile_size         = 2;
+  properties.max_total_tile_size       = 512;
+  return properties;
+}
+
+}  // Namespace Impl
+}  // Namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 131d180980965829968c0554a36ee282d5930ec1..2834e6f3de012b718ae06ebb6f87d7d24e3e5756 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -60,6 +60,7 @@
 #include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
 #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
 #include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_Team.hpp>
 #include <Kokkos_Vectorization.hpp>
 #include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
 
@@ -67,6 +68,7 @@
 #include <typeinfo>
 
 #include <KokkosExp_MDRangePolicy.hpp>
+#include <impl/KokkosExp_IterateTileGPU.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -474,7 +476,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
   Policy const& get_policy() const { return m_policy; }
 
-  inline __device__ void operator()(void) const {
+  inline __device__ void operator()() const {
     const Member work_stride = blockDim.y * gridDim.x;
     const Member work_end    = m_policy.end();
 
@@ -537,9 +539,23 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
   const Policy m_rp;
 
  public:
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelFor,
+                           LaunchBounds>::get_cuda_func_attributes();
+    auto const& prop = pol.space().cuda_device_prop();
+    // Limits due to registers/SM, MDRange doesn't have
+    // shared memory constraints
+    int const regs_per_sm        = prop.regsPerMultiprocessor;
+    int const regs_per_thread    = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+  }
   Policy const& get_policy() const { return m_rp; }
-
-  inline __device__ void operator()(void) const {
+  inline __device__ void operator()() const {
     Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
                                     typename Policy::work_tag>(m_rp, m_functor)
         .exec_range();
@@ -689,7 +705,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
  public:
   Policy const& get_policy() const { return m_policy; }
 
-  __device__ inline void operator()(void) const {
+  __device__ inline void operator()() const {
     // Iterate this block through the league
     int64_t threadid = 0;
     if (m_scratch_size[1] > 0) {
@@ -1248,8 +1264,21 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using DummySHMEMReductionType = int;
 
  public:
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelReduce,
+                           LaunchBounds>::get_cuda_func_attributes();
+    auto const& prop = pol.space().cuda_device_prop();
+    // Limits due do registers/SM
+    int const regs_per_sm        = prop.regsPerMultiprocessor;
+    int const regs_per_thread    = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+  }
   Policy const& get_policy() const { return m_policy; }
-
   inline __device__ void exec_range(reference_type update) const {
     Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType,
                                             typename Policy::work_tag,
@@ -1258,7 +1287,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         .exec_range();
   }
 
-  inline __device__ void operator()(void) const {
+  inline __device__ void operator()() const {
     /*    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
       DummySHMEMReductionType>::select(1,1.0) );
       }
@@ -2074,7 +2103,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
   //----------------------------------------
 
-  __device__ inline void initial(void) const {
+  __device__ inline void initial() const {
     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                    sizeof(size_type)>
         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
@@ -2110,7 +2139,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
   //----------------------------------------
 
-  __device__ inline void final(void) const {
+  __device__ inline void final() const {
     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                    sizeof(size_type)>
         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
@@ -2195,7 +2224,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
   //----------------------------------------
 
-  __device__ inline void operator()(void) const {
+  __device__ inline void operator()() const {
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
     if (m_run_serial) {
       typename ValueTraits::value_type value;
@@ -2364,7 +2393,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
   //----------------------------------------
 
-  __device__ inline void initial(void) const {
+  __device__ inline void initial() const {
     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                    sizeof(size_type)>
         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
@@ -2400,7 +2429,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
   //----------------------------------------
 
-  __device__ inline void final(void) const {
+  __device__ inline void final() const {
     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                    sizeof(size_type)>
         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
@@ -2487,7 +2516,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
   //----------------------------------------
 
-  __device__ inline void operator()(void) const {
+  __device__ inline void operator()() const {
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
     if (m_run_serial) {
       typename ValueTraits::value_type value;
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index 4b472f5d4fd8df7ae91a6ad04c3d3d2e15244196..e7806390155d46fd811a21432d9f9d268c457468 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -661,13 +661,14 @@ KOKKOS_INLINE_FUNCTION
       thread, count);
 }
 
-template <typename iType>
-KOKKOS_INLINE_FUNCTION
-    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>
-    ThreadVectorRange(const Impl::CudaTeamMember& thread, iType arg_begin,
-                      iType arg_end) {
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Impl::CudaTeamMember>
+ThreadVectorRange(const Impl::CudaTeamMember& thread, iType1 arg_begin,
+                  iType2 arg_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>(
-      thread, arg_begin, arg_end);
+      thread, iType(arg_begin), iType(arg_end));
 }
 
 KOKKOS_INLINE_FUNCTION
@@ -983,7 +984,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 
 //----------------------------------------------------------------------------
 
-/** \brief  Intra-thread vector parallel exclusive prefix sum.
+/** \brief  Intra-thread vector parallel scan with reducer.
  *
  *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
  *
@@ -991,25 +992,25 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
  *  thread and a scan operation is performed.
  *  The last call to closure has final == true.
  */
-template <typename iType, class Closure>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>&
-        loop_boundaries,
-    const Closure& closure) {
+template <typename iType, class Closure, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                      iType, Impl::CudaTeamMember>& loop_boundaries,
+                  const Closure& closure, const ReducerType& reducer) {
   (void)loop_boundaries;
   (void)closure;
+  (void)reducer;
 #ifdef __CUDA_ARCH__
 
-  // Extract value_type from closure
-
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+  using value_type = typename ReducerType::value_type;
+  value_type accum;
+  reducer.init(accum);
+  const value_type identity = accum;
 
   // Loop through boundaries by vector-length chunks
   // must scan at each iteration
 
-  value_type accum = 0;
-
   // All thread "lanes" must loop the same number of times.
   // Determine an loop end for all thread "lanes."
   // Requires:
@@ -1026,44 +1027,68 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0);
 
   for (int i = threadIdx.x; i < end; i += blockDim.x) {
-    value_type val = 0;
+    value_type val = identity;
 
-    // First acquire per-lane contributions:
-    if (i < loop_boundaries.end) closure(i, val, false);
+    // First acquire per-lane contributions.
+    // This sets i's val to i-1's contribution
+    // to make the latter in_place_shfl_up an
+    // exclusive scan -- the final accumulation
+    // of i's val will be included in the second
+    // closure call later.
+    if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false);
 
-    value_type sval = val;
-
-    // Bottom up inclusive scan in triangular pattern
+    // Bottom up exclusive scan in triangular pattern
     // where each CUDA thread is the root of a reduction tree
     // from the zeroth "lane" to itself.
     //  [t] += [t-1] if t >= 1
     //  [t] += [t-2] if t >= 2
     //  [t] += [t-4] if t >= 4
     //  ...
-
+    //  This differs from the non-reducer overload, where an inclusive scan was
+    //  implemented, because in general the binary operator cannot be inverted
+    //  and we would not be able to remove the inclusive contribution by
+    //  inversion.
     for (int j = 1; j < (int)blockDim.x; j <<= 1) {
-      value_type tmp = 0;
-      Impl::in_place_shfl_up(tmp, sval, j, blockDim.x, active_mask);
+      value_type tmp = identity;
+      Impl::in_place_shfl_up(tmp, val, j, blockDim.x, active_mask);
       if (j <= (int)threadIdx.x) {
-        sval += tmp;
+        reducer.join(val, tmp);
       }
     }
 
-    // Include accumulation and remove value for exclusive scan:
-    val = accum + sval - val;
+    // Include accumulation
+    reducer.join(val, accum);
 
-    // Provide exclusive scan value:
+    // Update i's contribution into the val
+    // and add it to accum for next round
     if (i < loop_boundaries.end) closure(i, val, true);
-
-    // Accumulate the last value in the inclusive scan:
-    Impl::in_place_shfl(sval, sval, mask, blockDim.x, active_mask);
-
-    accum += sval;
+    Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask);
   }
 
 #endif
 }
 
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template <typename iType, class Closure>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>&
+        loop_boundaries,
+    const Closure& closure) {
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+  value_type dummy;
+  parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
+}
+
 }  // namespace Kokkos
 
 namespace Kokkos {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
index f24abb377dae3102dd26341d5a733ddfd3281a1a..c55956ede9665bc3005fa570d7ac120404a54d49 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -139,7 +139,7 @@ struct CudaLDGFetch {
 
   template <typename iType>
   KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) && (350 <= _CUDA_ARCH__)
     AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
     return *(reinterpret_cast<ValueType*>(&v));
 #else
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index 05876a9f0226687c30b8f334c77dc65c1ca4e780..fc52e415145218afa2c495e9f055e051e9921305 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -46,6 +46,7 @@
 #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
 
 #include <Kokkos_Cuda.hpp>
+#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
 
 namespace Kokkos {
 namespace Impl {
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
index 89135b6c45b9483af071e6d921583b8954f93ae5..9278d1bdc9efcc2a76183085c974afef41413e3c 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
@@ -75,17 +75,6 @@ void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
   hipOccupancy<DriverType, constant, HIPTraits::MaxThreadsPerBlock, 1>(
       numBlocks, blockSize, sharedmem);
 }
-template <typename DriverType, typename LaunchBounds, bool Large>
-struct HIPGetMaxBlockSize;
-
-template <typename DriverType, typename LaunchBounds>
-int hip_get_max_block_size(typename DriverType::functor_type const &f,
-                           size_t const vector_length,
-                           size_t const shmem_extra_block,
-                           size_t const shmem_extra_thread) {
-  return HIPGetMaxBlockSize<DriverType, LaunchBounds, true>::get_block_size(
-      f, vector_length, shmem_extra_block, shmem_extra_thread);
-}
 
 template <class FunctorType, class LaunchBounds, typename F>
 int hip_internal_get_block_size(const F &condition_check,
@@ -131,10 +120,6 @@ int hip_internal_get_block_size(const F &condition_check,
   int opt_block_size =
       (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm;
   int opt_threads_per_sm = threads_per_sm;
-  // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i
-  // Achieved: %i %i Opt: %i %i\n",block_size,
-  //   shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
-  //   regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
   block_size -= HIPTraits::WarpSize;
   while (condition_check(blocks_per_sm) &&
          (block_size >= HIPTraits::WarpSize)) {
@@ -160,10 +145,6 @@ int hip_internal_get_block_size(const F &condition_check,
         opt_threads_per_sm = threads_per_sm;
       }
     }
-    // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i
-    // Achieved: %i %i Opt: %i %i\n",block_size,
-    //   shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
-    //   regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
     block_size -= HIPTraits::WarpSize;
   }
   return opt_block_size;
@@ -178,62 +159,6 @@ int hip_get_max_block_size(const HIPInternal *hip_instance,
       [](int x) { return x == 0; }, hip_instance, attr, f, vector_length,
       shmem_block, shmem_thread);
 }
-template <typename DriverType, class LaunchBounds>
-struct HIPGetMaxBlockSize<DriverType, LaunchBounds, true> {
-  static int get_block_size(typename DriverType::functor_type const &f,
-                            size_t const vector_length,
-                            size_t const shmem_extra_block,
-                            size_t const shmem_extra_thread) {
-    int numBlocks = 0;
-    int blockSize = LaunchBounds::maxTperB == 0 ? 1024 : LaunchBounds::maxTperB;
-    int sharedmem =
-        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-        ::Kokkos::Impl::FunctorTeamShmemSize<
-            typename DriverType::functor_type>::value(f, blockSize /
-                                                             vector_length);
-
-    hipOccupancy<DriverType, true>(&numBlocks, blockSize, sharedmem);
-
-    if (numBlocks > 0) return blockSize;
-    while (blockSize > HIPTraits::WarpSize && numBlocks == 0) {
-      blockSize /= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          ::Kokkos::Impl::FunctorTeamShmemSize<
-              typename DriverType::functor_type>::value(f, blockSize /
-                                                               vector_length);
-
-      hipOccupancy<DriverType, true>(&numBlocks, blockSize, sharedmem);
-    }
-    int blockSizeUpperBound = blockSize * 2;
-    while (blockSize < blockSizeUpperBound && numBlocks > 0) {
-      blockSize += HIPTraits::WarpSize;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          ::Kokkos::Impl::FunctorTeamShmemSize<
-              typename DriverType::functor_type>::value(f, blockSize /
-                                                               vector_length);
-
-      hipOccupancy<DriverType, true>(&numBlocks, blockSize, sharedmem);
-    }
-    return blockSize - HIPTraits::WarpSize;
-  }
-};
-
-template <typename DriverType, typename LaunchBounds, bool Large>
-struct HIPGetOptBlockSize;
-
-template <typename DriverType, typename LaunchBounds>
-int hip_get_opt_block_size(typename DriverType::functor_type const &f,
-                           size_t const vector_length,
-                           size_t const shmem_extra_block,
-                           size_t const shmem_extra_thread) {
-  return HIPGetOptBlockSize<
-      DriverType, LaunchBounds,
-      (HIPTraits::ConstantMemoryUseThreshold <
-       sizeof(DriverType))>::get_block_size(f, vector_length, shmem_extra_block,
-                                            shmem_extra_thread);
-}
 
 template <typename FunctorType, typename LaunchBounds>
 int hip_get_opt_block_size(HIPInternal const *hip_instance,
@@ -245,157 +170,6 @@ int hip_get_opt_block_size(HIPInternal const *hip_instance,
       shmem_block, shmem_thread);
 }
 
-// FIXME_HIP the code is identical to the false struct except for
-// hip_parallel_launch_constant_memory
-template <typename DriverType>
-struct HIPGetOptBlockSize<DriverType, Kokkos::LaunchBounds<0, 0>, true> {
-  static int get_block_size(typename DriverType::functor_type const &f,
-                            size_t const vector_length,
-                            size_t const shmem_extra_block,
-                            size_t const shmem_extra_thread) {
-    int blockSize = HIPTraits::WarpSize / 2;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-
-    while (blockSize < HIPTraits::MaxThreadsPerBlock) {
-      blockSize *= 2;
-
-      // calculate the occupancy with that optBlockSize and check whether its
-      // larger than the largest one found so far
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          ::Kokkos::Impl::FunctorTeamShmemSize<
-              typename DriverType::functor_type>::value(f, blockSize /
-                                                               vector_length);
-      hipOccupancy<DriverType, true>(&numBlocks, blockSize, sharedmem);
-      if (maxOccupancy < numBlocks * blockSize) {
-        maxOccupancy  = numBlocks * blockSize;
-        bestBlockSize = blockSize;
-      }
-    }
-    return bestBlockSize;
-  }
-};
-
-template <typename DriverType>
-struct HIPGetOptBlockSize<DriverType, Kokkos::LaunchBounds<0, 0>, false> {
-  static int get_block_size(const typename DriverType::functor_type &f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int blockSize = HIPTraits::WarpSize / 2;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-
-    while (blockSize < HIPTraits::MaxThreadsPerBlock) {
-      blockSize *= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          ::Kokkos::Impl::FunctorTeamShmemSize<
-              typename DriverType::functor_type>::value(f, blockSize /
-                                                               vector_length);
-
-      hipOccupancy<DriverType, false>(&numBlocks, blockSize, sharedmem);
-
-      if (maxOccupancy < numBlocks * blockSize) {
-        maxOccupancy  = numBlocks * blockSize;
-        bestBlockSize = blockSize;
-      }
-    }
-    return bestBlockSize;
-  }
-};
-
-// FIXME_HIP the code is identical to the false struct except for
-// hip_parallel_launch_constant_memory
-template <typename DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
-struct HIPGetOptBlockSize<
-    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    true> {
-  static int get_block_size(const typename DriverType::functor_type &f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int blockSize = HIPTraits::WarpSize / 2;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-    int max_threads_per_block =
-        std::min(MaxThreadsPerBlock,
-                 hip_internal_maximum_warp_count() * HIPTraits::WarpSize);
-
-    while (blockSize < max_threads_per_block) {
-      blockSize *= 2;
-
-      // calculate the occupancy with that optBlockSize and check whether its
-      // larger than the largest one found so far
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          ::Kokkos::Impl::FunctorTeamShmemSize<
-              typename DriverType::functor_type>::value(f, blockSize /
-                                                               vector_length);
-      hipOccupancy<DriverType, true, MaxThreadsPerBlock, MinBlocksPerSM>(
-          &numBlocks, blockSize, sharedmem);
-      if (numBlocks >= static_cast<int>(MinBlocksPerSM) &&
-          blockSize <= static_cast<int>(MaxThreadsPerBlock)) {
-        if (maxOccupancy < numBlocks * blockSize) {
-          maxOccupancy  = numBlocks * blockSize;
-          bestBlockSize = blockSize;
-        }
-      }
-    }
-    if (maxOccupancy > 0) return bestBlockSize;
-    return -1;
-  }
-};
-
-template <typename DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
-struct HIPGetOptBlockSize<
-    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    false> {
-  static int get_block_size(const typename DriverType::functor_type &f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int blockSize = HIPTraits::WarpSize / 2;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-    int max_threads_per_block =
-        std::min(MaxThreadsPerBlock,
-                 hip_internal_maximum_warp_count() * HIPTraits::WarpSize);
-
-    while (blockSize < max_threads_per_block) {
-      blockSize *= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          ::Kokkos::Impl::FunctorTeamShmemSize<
-              typename DriverType::functor_type>::value(f, blockSize /
-                                                               vector_length);
-
-      hipOccupancy<DriverType, false, MaxThreadsPerBlock, MinBlocksPerSM>(
-          &numBlocks, blockSize, sharedmem);
-      if (numBlocks >= int(MinBlocksPerSM) &&
-          blockSize <= int(MaxThreadsPerBlock)) {
-        if (maxOccupancy < numBlocks * blockSize) {
-          maxOccupancy  = numBlocks * blockSize;
-          bestBlockSize = blockSize;
-        }
-      }
-    }
-    if (maxOccupancy > 0) return bestBlockSize;
-    return -1;
-  }
-};
-
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index 45512038acafee993aaf50d752ade2763279c45a..18ef10e22cd39b30118f78882a3ce747c19b9901 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -164,6 +164,8 @@ HIPInternal &HIPInternal::singleton() {
 
 void HIPInternal::fence() const {
   HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
+  // can reset our cycle id now as well
+  m_cycleId = 0;
 }
 
 void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
@@ -256,7 +258,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
                                                void>;
 
       Record *const r = Record::allocate(Kokkos::Experimental::HIPSpace(),
-                                         "InternalScratchBitset",
+                                         "Kokkos::InternalScratchBitset",
                                          sizeof(uint32_t) * buffer_bound);
 
       Record::increment(r);
@@ -303,8 +305,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_space(
         Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                                              void>;
 
-    static Record *const r = Record::allocate(
-        Kokkos::Experimental::HIPSpace(), "InternalScratchSpace",
+    if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
+
+    Record *const r = Record::allocate(
+        Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchSpace",
         (sizeScratchGrain * m_scratchSpaceCount));
 
     Record::increment(r);
@@ -325,8 +329,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags(
         Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                                              void>;
 
+    if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
+
     Record *const r = Record::allocate(
-        Kokkos::Experimental::HIPSpace(), "InternalScratchFlags",
+        Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchFlags",
         (sizeScratchGrain * m_scratchFlagsCount));
 
     Record::increment(r);
@@ -345,7 +351,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes,
   if (m_team_scratch_current_size == 0) {
     m_team_scratch_current_size = bytes;
     m_team_scratch_ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::HIPSpace>(
-        "HIPSpace::ScratchMemory", m_team_scratch_current_size);
+        "Kokkos::HIPSpace::TeamScratchMemory", m_team_scratch_current_size);
   }
   if ((bytes > m_team_scratch_current_size) ||
       ((bytes < m_team_scratch_current_size) && (force_shrink))) {
@@ -388,6 +394,40 @@ void HIPInternal::finalize() {
     m_team_scratch_current_size = 0;
     m_team_scratch_ptr          = nullptr;
   }
+  if (nullptr != d_driverWorkArray) {
+    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    d_driverWorkArray = nullptr;
+  }
+}
+
+char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
+  std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
+  if (d_driverWorkArray == nullptr) {
+    HIP_SAFE_CALL(
+        hipHostMalloc(&d_driverWorkArray,
+                      m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
+                      hipHostMallocNonCoherent));
+  }
+  if (driverTypeSize > m_maxDriverTypeSize) {
+    // fence handles the cycle id reset for us
+    fence();
+    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    m_maxDriverTypeSize = driverTypeSize;
+    if (m_maxDriverTypeSize % 128 != 0)
+      m_maxDriverTypeSize =
+          m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128;
+    HIP_SAFE_CALL(
+        hipHostMalloc(&d_driverWorkArray,
+                      m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
+                      hipHostMallocNonCoherent));
+  } else {
+    m_cycleId = (m_cycleId + 1) % m_maxDriverCycles;
+    if (m_cycleId == 0) {
+      // ensure any outstanding kernels are completed before we wrap around
+      fence();
+    }
+  }
+  return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId];
 }
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
index 07ec8625e6932647c0601fa8423354e25522321f..f4f88628e313a2d22d23a09e4ce25630d242a566 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
@@ -49,6 +49,8 @@
 
 #include <Kokkos_HIP_Space.hpp>
 
+#include <mutex>
+
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
@@ -83,33 +85,46 @@ class HIPInternal {
  public:
   using size_type = ::Kokkos::Experimental::HIP::size_type;
 
-  int m_hipDev;
-  int m_hipArch;
-  unsigned m_multiProcCount;
-  unsigned m_maxWarpCount;
-  unsigned m_maxBlock;
-  unsigned m_maxBlocksPerSM;
-  unsigned m_maxSharedWords;
+  int m_hipDev              = -1;
+  int m_hipArch             = -1;
+  unsigned m_multiProcCount = 0;
+  unsigned m_maxWarpCount   = 0;
+  unsigned m_maxBlock       = 0;
+  unsigned m_maxBlocksPerSM = 0;
+  unsigned m_maxSharedWords = 0;
   int m_regsPerSM;
-  int m_shmemPerSM;
-  int m_maxShmemPerBlock;
-  int m_maxThreadsPerSM;
+  int m_shmemPerSM       = 0;
+  int m_maxShmemPerBlock = 0;
+  int m_maxThreadsPerSM  = 0;
+
+  // array of DriverTypes to be allocated in host-pinned memory for async
+  // kernel launches
+  mutable char *d_driverWorkArray = nullptr;
+  // number of kernel launches that can be in-flight w/o synchronization
+  const int m_maxDriverCycles = 100;
+  // max size of a DriverType [bytes]
+  mutable size_t m_maxDriverTypeSize = 1024 * 10;
+  // the current index in the driverWorkArray
+  mutable int m_cycleId = 0;
+  // mutex to access d_driverWorkArray
+  mutable std::mutex m_mutexWorkArray;
 
   // Scratch Spaces for Reductions
-  size_type m_scratchSpaceCount;
-  size_type m_scratchFlagsCount;
+  size_type m_scratchSpaceCount = 0;
+  size_type m_scratchFlagsCount = 0;
 
-  size_type *m_scratchSpace;
-  size_type *m_scratchFlags;
+  size_type *m_scratchSpace           = nullptr;
+  size_type *m_scratchFlags           = nullptr;
   uint32_t *m_scratchConcurrentBitset = nullptr;
 
   hipDeviceProp_t m_deviceProp;
 
-  hipStream_t m_stream;
+  hipStream_t m_stream = nullptr;
 
   // Team Scratch Level 1 Space
-  mutable int64_t m_team_scratch_current_size;
-  mutable void *m_team_scratch_ptr;
+  mutable int64_t m_team_scratch_current_size = 0;
+  mutable void *m_team_scratch_ptr            = nullptr;
+  mutable std::mutex m_team_scratch_mutex;
 
   bool was_finalized = false;
 
@@ -117,9 +132,7 @@ class HIPInternal {
 
   int verify_is_initialized(const char *const label) const;
 
-  int is_initialized() const {
-    return m_hipDev >= 0;
-  }  // 0 != m_scratchSpace && 0 != m_scratchFlags ; }
+  int is_initialized() const { return m_hipDev >= 0; }
 
   void initialize(int hip_device_id, hipStream_t stream = nullptr);
   void finalize();
@@ -128,25 +141,12 @@ class HIPInternal {
 
   void fence() const;
 
+  // returns the next driver type pointer in our work array
+  char *get_next_driver(size_t driverTypeSize) const;
+
   ~HIPInternal();
 
-  HIPInternal()
-      : m_hipDev(-1),
-        m_hipArch(-1),
-        m_multiProcCount(0),
-        m_maxWarpCount(0),
-        m_maxBlock(0),
-        m_maxSharedWords(0),
-        m_shmemPerSM(0),
-        m_maxShmemPerBlock(0),
-        m_maxThreadsPerSM(0),
-        m_scratchSpaceCount(0),
-        m_scratchFlagsCount(0),
-        m_scratchSpace(nullptr),
-        m_scratchFlags(nullptr),
-        m_stream(nullptr),
-        m_team_scratch_current_size(0),
-        m_team_scratch_ptr(nullptr) {}
+  HIPInternal() = default;
 
   // Resizing of reduction related scratch spaces
   size_type *scratch_space(const size_type size);
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
index 3e972c7346b839abc0efa69533236a29f97ed3d4..f774423b378b0753a98c9e4df512b599910028dd 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
@@ -49,9 +49,9 @@
 
 #if defined(__HIPCC__)
 
-#include <Kokkos_HIP_Space.hpp>
 #include <HIP/Kokkos_HIP_Error.hpp>
 #include <HIP/Kokkos_HIP_Instance.hpp>
+#include <Kokkos_HIP_Space.hpp>
 
 // Must use global variable on the device with HIP-Clang
 #ifdef __HIP__
@@ -127,93 +127,87 @@ struct HIPDispatchProperties {
   HIPLaunchMechanism launch_mechanism = l;
 };
 
-template <class DriverType, class LaunchBounds = Kokkos::LaunchBounds<>,
-          HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory>
-struct HIPParallelLaunch;
+template <typename DriverType, typename LaunchBounds,
+          HIPLaunchMechanism LaunchMechanism>
+struct HIPParallelLaunchKernelFunc;
 
-template <class DriverType, unsigned int MaxThreadsPerBlock,
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
           unsigned int MinBlocksPerSM>
-struct HIPParallelLaunch<
+struct HIPParallelLaunchKernelFunc<
     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
     HIPLaunchMechanism::LocalMemory> {
-  inline HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
-                           const dim3 &block, const int shmem,
-                           const HIPInternal *hip_instance,
-                           const bool /*prefer_shmem*/) {
-    if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
-      if (hip_instance->m_maxShmemPerBlock < shmem) {
-        Kokkos::Impl::throw_runtime_exception(
-            "HIPParallelLaunch FAILED: shared memory request is too large");
-      }
-
-      KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
-
-      // FIXME_HIP -- there is currently an error copying (some) structs
-      // by value to the device in HIP-Clang / VDI
-      // As a workaround, we can malloc the DriverType and explictly copy over.
-      // To remove once solved in HIP
-      DriverType *d_driver;
-      HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType)));
-      HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType),
-                                   hipMemcpyHostToDevice,
-                                   hip_instance->m_stream));
-      hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                       MinBlocksPerSM>
-          <<<grid, block, shmem, hip_instance->m_stream>>>(d_driver);
+  static auto get_kernel_func() {
+    return hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
+                                            MinBlocksPerSM>;
+  }
+};
 
-#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      HIP_SAFE_CALL(hipGetLastError());
-      hip_instance->fence();
-#endif
-      HIP_SAFE_CALL(hipFree(d_driver));
-    }
+template <typename DriverType>
+struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                   HIPLaunchMechanism::LocalMemory> {
+  static auto get_kernel_func() {
+    return hip_parallel_launch_local_memory<DriverType, 1024, 1>;
   }
+};
 
-  static hipFuncAttributes get_hip_func_attributes() {
-    static hipFuncAttributes attr = []() {
-      hipFuncAttributes attr;
-      HIP_SAFE_CALL(hipFuncGetAttributes(
-          &attr,
-          reinterpret_cast<void const *>(
-              hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                               MinBlocksPerSM>)));
-      return attr;
-    }();
-    return attr;
+template <typename DriverType, typename LaunchBounds,
+          HIPLaunchMechanism LaunchMechanism>
+struct HIPParallelLaunchKernelInvoker;
+
+template <typename DriverType, typename LaunchBounds>
+struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                      HIPLaunchMechanism::LocalMemory>
+    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::LocalMemory> {
+  using base_t = HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                             HIPLaunchMechanism::LocalMemory>;
+
+  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
+                            dim3 const &block, int shmem,
+                            HIPInternal const *hip_instance) {
+    (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
+        driver);
   }
 };
 
-template <class DriverType>
-struct HIPParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
-                         HIPLaunchMechanism::LocalMemory> {
-  inline HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
-                           const dim3 &block, const int shmem,
-                           const HIPInternal *hip_instance,
-                           const bool /*prefer_shmem*/) {
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory>
+struct HIPParallelLaunch;
+
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct HIPParallelLaunch<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    HIPLaunchMechanism::LocalMemory>
+    : HIPParallelLaunchKernelInvoker<
+          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+          HIPLaunchMechanism::LocalMemory> {
+  using base_t = HIPParallelLaunchKernelInvoker<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::LocalMemory>;
+
+  HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
+                    const dim3 &block, const int shmem,
+                    const HIPInternal *hip_instance,
+                    const bool /*prefer_shmem*/) {
     if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
       if (hip_instance->m_maxShmemPerBlock < shmem) {
-        Kokkos::Impl::throw_runtime_exception(std::string(
-            "HIPParallelLaunch FAILED: shared memory request is too large"));
+        Kokkos::Impl::throw_runtime_exception(
+            "HIPParallelLaunch FAILED: shared memory request is too large");
       }
 
       KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
 
       // Invoke the driver function on the device
-
-      // FIXME_HIP -- see note about struct copy by value above
-      DriverType *d_driver;
-      HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType)));
-      HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType),
-                                   hipMemcpyHostToDevice,
-                                   hip_instance->m_stream));
-      hip_parallel_launch_local_memory<DriverType, 1024, 1>
-          <<<grid, block, shmem, hip_instance->m_stream>>>(d_driver);
+      DriverType *d_driver = reinterpret_cast<DriverType *>(
+          hip_instance->get_next_driver(sizeof(DriverType)));
+      std::memcpy((void *)d_driver, (void *)&driver, sizeof(DriverType));
+      base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
       HIP_SAFE_CALL(hipGetLastError());
       hip_instance->fence();
 #endif
-      HIP_SAFE_CALL(hipFree(d_driver));
     }
   }
 
@@ -221,8 +215,7 @@ struct HIPParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
     static hipFuncAttributes attr = []() {
       hipFuncAttributes attr;
       HIP_SAFE_CALL(hipFuncGetAttributes(
-          &attr, reinterpret_cast<void const *>(
-                     hip_parallel_launch_local_memory<DriverType, 1024, 1>)));
+          &attr, reinterpret_cast<void const *>(base_t::get_kernel_func())));
       return attr;
     }();
     return attr;
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ce1aff9586d25911104d17d53860409f3e73b10b
--- /dev/null
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
@@ -0,0 +1,37 @@
+#ifndef KOKKOS_HIP_MDRANGEPOLICY_HPP_
+#define KOKKOS_HIP_MDRANGEPOLICY_HPP_
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+
+template <>
+struct default_outer_direction<Kokkos::Experimental::HIP> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+template <>
+struct default_inner_direction<Kokkos::Experimental::HIP> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+namespace Impl {
+
+// Settings for MDRangePolicy
+template <>
+inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::HIP>(
+    const Kokkos::Experimental::HIP& space) {
+  TileSizeProperties properties;
+  properties.max_threads =
+      space.impl_internal_space_instance()->m_maxThreadsPerSM;
+  properties.default_largest_tile_size = 16;
+  properties.default_tile_size         = 4;
+  properties.max_total_tile_size       = 1024;
+  return properties;
+}
+
+}  // Namespace Impl
+}  // Namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
index 6b831ff7a3dd82d9d8a54ccc6f6f759548f5a65f..35e7d6fb853ae9e4f245e0fe0c2a71f4f2d4d6c2 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -49,6 +49,7 @@
 #include <HIP/Kokkos_HIP_KernelLaunch.hpp>
 #include <HIP/Kokkos_HIP_ReduceScan.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
+#include <impl/KokkosExp_IterateTileGPU.hpp>
 #include <Kokkos_Parallel.hpp>
 
 namespace Kokkos {
@@ -72,7 +73,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   ParallelFor& operator=(ParallelFor const&) = delete;
 
  public:
-  inline __device__ void operator()(void) const {
+  inline __device__ void operator()() const {
     Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
                                     typename Policy::work_tag>(m_policy,
                                                                m_functor)
@@ -175,6 +176,25 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
   ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    using closure_type =
+        ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                    Kokkos::Experimental::HIP>;
+    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
+        closure_type, LaunchBounds>::get_hip_func_attributes();
+    auto const& prop = pol.space().hip_device_prop();
+    // Limits due to registers/SM, MDRange doesn't have
+    // shared memory constraints
+    int const regs_per_sm        = prop.regsPerMultiprocessor;
+    int const regs_per_thread    = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(
+            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+  }
 };
 
 // ParallelReduce
@@ -231,7 +251,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     DeviceIteratePattern(m_policy, m_functor, update).exec_range();
   }
 
-  inline __device__ void operator()(void) const {
+  inline __device__ void operator()() const {
     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                    sizeof(size_type)>
         word_count(ValueTraits::value_size(
@@ -291,13 +311,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
     int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
         false, FunctorType, WorkTag>(f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
+        closure_type, LaunchBounds>::get_hip_func_attributes();
     while (
         (n &&
          (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
           shmem_size)) ||
-        (n > static_cast<unsigned>(
-                 ::Kokkos::Experimental::Impl::hip_get_max_block_size<
-                     ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) {
+        (n >
+         static_cast<unsigned>(
+             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
+                                                                  LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
       n >>= 1;
       shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
           false, FunctorType, WorkTag>(f, n);
@@ -391,6 +417,23 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                                   memory_space>::accessible),
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr) {}
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    using closure_type =
+        ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                       ReducerType, Kokkos::Experimental::HIP>;
+    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
+        closure_type, LaunchBounds>::get_hip_func_attributes();
+    auto const& prop = pol.space().hip_device_prop();
+    // Limits due do registers/SM
+    int const regs_per_sm        = prop.regsPerMultiprocessor;
+    int const regs_per_thread    = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(
+            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+  }
 };
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
index 5607f1c91a5da80cfe6111f28476dc8610e30160..7d2825eeb4c6be1d060d1e8d7c3eb67097729ccf 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
@@ -92,7 +92,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
  public:
   using functor_type = FunctorType;
 
-  inline __device__ void operator()(void) const {
+  inline __device__ void operator()() const {
     const Member work_stride = blockDim.y * gridDim.x;
     const Member work_end    = m_policy.end();
 
@@ -174,11 +174,14 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   size_type* m_scratch_space = nullptr;
   size_type* m_scratch_flags = nullptr;
 
-  // FIXME_HIP_PERFORMANCE Need a rule to choose when to use shared memory and
-  // when to use shuffle
+#if HIP_VERSION < 401
   static bool constexpr UseShflReduction =
       ((sizeof(value_type) > 2 * sizeof(double)) &&
        static_cast<bool>(ValueTraits::StaticValueSize));
+#else
+  static bool constexpr UseShflReduction =
+      static_cast<bool>(ValueTraits::StaticValueSize);
+#endif
 
  private:
   struct ShflReductionTag {};
@@ -330,13 +333,19 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     int shmem_size =
         hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
             f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
+        closure_type, LaunchBounds>::get_hip_func_attributes();
     while (
         (n &&
          (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
           shmem_size)) ||
-        (n > static_cast<unsigned int>(
-                 Kokkos::Experimental::Impl::hip_get_max_block_size<
-                     ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) {
+        (n >
+         static_cast<unsigned int>(
+             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
+                                                                  LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
       n >>= 1;
       shmem_size =
           hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
@@ -493,7 +502,7 @@ class ParallelScanHIPBase {
 
   //----------------------------------------
 
-  __device__ inline void initial(void) const {
+  __device__ inline void initial() const {
     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                    sizeof(size_type)>
         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
@@ -529,7 +538,7 @@ class ParallelScanHIPBase {
 
   //----------------------------------------
 
-  __device__ inline void final(void) const {
+  __device__ inline void final() const {
     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                    sizeof(size_type)>
         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
@@ -606,7 +615,7 @@ class ParallelScanHIPBase {
  public:
   //----------------------------------------
 
-  __device__ inline void operator()(void) const {
+  __device__ inline void operator()() const {
     if (!m_final) {
       initial();
     } else {
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
index 5da83d289e2f1fa0c30dbddd3e9dd8d47c571af1..96c3ff2a751027a4eb05b03c99487207c9acf708 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -433,6 +433,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_shmem_size;
   void* m_scratch_ptr[2];
   int m_scratch_size[2];
+  // Only let one ParallelFor/Reduce modify the team scratch memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_scratch_lock;
 
   template <typename TagType>
   __device__ inline
@@ -449,7 +452,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
  public:
-  __device__ inline void operator()(void) const {
+  __device__ inline void operator()() const {
     // Iterate this block through the league
     int64_t threadid = 0;
     if (m_scratch_size[1] > 0) {
@@ -513,7 +516,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy(arg_policy),
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
+        m_vector_size(arg_policy.impl_vector_length()),
+        m_scratch_lock(m_policy.space()
+                           .impl_internal_space_instance()
+                           ->m_team_scratch_mutex) {
     hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
         ParallelFor, launch_bounds>::get_hip_func_attributes();
     m_team_size =
@@ -640,6 +646,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const size_type m_league_size;
   int m_team_size;
   const size_type m_vector_size;
+  // Only let one ParallelFor/Reduce modify the team scratch memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_scratch_lock;
 
   template <class TagType>
   __device__ inline
@@ -877,7 +886,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_ptr{nullptr, nullptr},
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
+        m_vector_size(arg_policy.impl_vector_length()),
+        m_scratch_lock(m_policy.space()
+                           .impl_internal_space_instance()
+                           ->m_team_scratch_mutex) {
     hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
         ParallelReduce, launch_bounds>::get_hip_func_attributes();
     m_team_size =
@@ -976,7 +988,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_ptr{nullptr, nullptr},
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
+        m_vector_size(arg_policy.impl_vector_length()),
+        m_scratch_lock(m_policy.space()
+                           .impl_internal_space_instance()
+                           ->m_team_scratch_mutex) {
     hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
         ParallelReduce, launch_bounds>::get_hip_func_attributes();
     m_team_size =
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
index 00cef28f826d05befc14925f9b58bbf095a097c0..15ca089d14740b6a2c42c69945a17a0c7bfa1bcc 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -42,12 +42,6 @@
 //@HEADER
 */
 
-#include <stdlib.h>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <algorithm>
-#include <atomic>
 #include <Kokkos_Macros.hpp>
 
 #include <Kokkos_Core.hpp>
@@ -57,6 +51,13 @@
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>
 
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <algorithm>
+#include <atomic>
+
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 namespace Kokkos {
@@ -172,14 +173,14 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
 
 namespace Kokkos {
 
-void Experimental::HIPSpace::access_error() {
+KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() {
   const std::string msg(
       "Kokkos::Experimental::HIPSpace::access_error attempt to execute "
       "Experimental::HIP function from non-HIP space");
   Kokkos::Impl::throw_runtime_exception(msg);
 }
 
-void Experimental::HIPSpace::access_error(const void* const) {
+KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) {
   const std::string msg(
       "Kokkos::Experimental::HIPSpace::access_error attempt to execute "
       "Experimental::HIP function from non-HIP space");
@@ -326,45 +327,6 @@ SharedAllocationRecord<void, void> SharedAllocationRecord<
     Kokkos::Experimental::HIPHostPinnedSpace, void>::s_root_record;
 #endif
 
-std::string SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
-                                   void>::get_label() const {
-  SharedAllocationHeader header;
-
-  Kokkos::Impl::DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>(
-      &header, RecordBase::head(), sizeof(SharedAllocationHeader));
-
-  return std::string(header.m_label);
-}
-
-std::string SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace,
-                                   void>::get_label() const {
-  return std::string(RecordBase::head()->m_label);
-}
-
-SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>*
-SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::allocate(
-    const Kokkos::Experimental::HIPSpace& arg_space,
-    const std::string& arg_label, const size_t arg_alloc_size) {
-  return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size);
-}
-
-SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>*
-SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
-    allocate(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
-             const std::string& arg_label, const size_t arg_alloc_size) {
-  return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size);
-}
-
-void SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::deallocate(
-    SharedAllocationRecord<void, void>* arg_rec) {
-  delete static_cast<SharedAllocationRecord*>(arg_rec);
-}
-
-void SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
-    deallocate(SharedAllocationRecord<void, void>* arg_rec) {
-  delete static_cast<SharedAllocationRecord*>(arg_rec);
-}
-
 SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                        void>::~SharedAllocationRecord() {
   const char* label = nullptr;
@@ -393,7 +355,7 @@ SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
         const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
                                   void>::s_root_record,
@@ -405,13 +367,7 @@ SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
 
   SharedAllocationHeader header;
 
-  // Fill in the Header information
-  header.m_record = static_cast<SharedAllocationRecord<void, void>*>(this);
-
-  strncpy(header.m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-  // Set last element zero, in case c_str is too long
-  header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
+  this->base_t::_fill_host_accessible_header_info(header, arg_label);
 
   // Copy to device memory
   Kokkos::Impl::DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace>(
@@ -425,7 +381,7 @@ SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
         const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace,
                                   void>::s_root_record,
@@ -435,223 +391,8 @@ SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
       m_space(arg_space) {
   // Fill in the Header information, directly accessible via host pinned memory
-
-  RecordBase::m_alloc_ptr->m_record = this;
-
-  strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-  // Set last element zero, in case c_str is too long
-  RecordBase::m_alloc_ptr
-      ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
-}
-
-//----------------------------------------------------------------------------
-
-void* SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
-    allocate_tracked(const Kokkos::Experimental::HIPSpace& arg_space,
-                     const std::string& arg_alloc_label,
-                     const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord* const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
-}
-
-void SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
-                            void>::deallocate_tracked(void* const
-                                                          arg_alloc_ptr) {
-  if (arg_alloc_ptr != nullptr) {
-    SharedAllocationRecord* const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
-  }
-}
-
-void* SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
-    reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord* const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  Kokkos::Impl::DeepCopy<Kokkos::Experimental::HIPSpace,
-                         Kokkos::Experimental::HIPSpace>(
-      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
-}
-
-void* SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
-    allocate_tracked(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
-                     const std::string& arg_alloc_label,
-                     const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord* const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
-}
-
-void SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace,
-                            void>::deallocate_tracked(void* const
-                                                          arg_alloc_ptr) {
-  if (arg_alloc_ptr) {
-    SharedAllocationRecord* const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
-  }
-}
-
-void* SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
-    reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord* const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  using HIPHostPinnedSpace = Kokkos::Experimental::HIPHostPinnedSpace;
-  Kokkos::Impl::DeepCopy<HIPHostPinnedSpace, HIPHostPinnedSpace>(
-      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
-}
-
-//----------------------------------------------------------------------------
-
-SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>*
-SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::get_record(
-    void* alloc_ptr) {
-  using Header = SharedAllocationHeader;
-  using RecordHIP =
-      SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>;
-
-  // Copy the header from the allocation
-  Header head;
-
-  Header const* const head_hip =
-      alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;
-
-  if (alloc_ptr) {
-    Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
-        &head, head_hip, sizeof(SharedAllocationHeader));
-  }
-
-  RecordHIP* const record =
-      alloc_ptr ? static_cast<RecordHIP*>(head.m_record) : nullptr;
-
-  if (!alloc_ptr || record->m_alloc_ptr != head_hip) {
-    Kokkos::Impl::throw_runtime_exception(std::string(
-        "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HIPSpace "
-        ", void >::get_record ERROR"));
-  }
-
-  return record;
-}
-
-SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>*
-SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace,
-                       void>::get_record(void* alloc_ptr) {
-  using Header = SharedAllocationHeader;
-  using RecordHIP =
-      SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>;
-
-  Header* const h =
-      alloc_ptr ? reinterpret_cast<Header*>(alloc_ptr) - 1 : nullptr;
-
-  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
-    Kokkos::Impl::throw_runtime_exception(std::string(
-        "Kokkos::Impl::SharedAllocationRecord< "
-        "Kokkos::Experimental::HIPHostPinnedSpace , void >::get_record ERROR"));
-  }
-
-  return static_cast<RecordHIP*>(h->m_record);
-}
-
-// Iterate records to print orphaned memory ...
-void SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
-    print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace&,
-                  bool detail) {
-#ifdef KOKKOS_ENABLE_DEBUG
-  SharedAllocationRecord<void, void>* r = &s_root_record;
-
-  char buffer[256];
-
-  SharedAllocationHeader head;
-
-  if (detail) {
-    do {
-      if (r->m_alloc_ptr) {
-        Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
-            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
-      } else {
-        head.m_label[0] = 0;
-      }
-
-      // Formatting dependent on sizeof(uintptr_t)
-      const char* format_string;
-
-      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-        format_string =
-            "HIP addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + "
-            "%.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
-      } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-        format_string =
-            "HIP addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ "
-            "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
-      }
-
-      snprintf(buffer, 256, format_string, reinterpret_cast<uintptr_t>(r),
-               reinterpret_cast<uintptr_t>(r->m_prev),
-               reinterpret_cast<uintptr_t>(r->m_next),
-               reinterpret_cast<uintptr_t>(r->m_alloc_ptr), r->m_alloc_size,
-               r->m_count, reinterpret_cast<uintptr_t>(r->m_dealloc),
-               head.m_label);
-      s << buffer;
-      r = r->m_next;
-    } while (r != &s_root_record);
-  } else {
-    do {
-      if (r->m_alloc_ptr) {
-        Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
-            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
-
-        // Formatting dependent on sizeof(uintptr_t)
-        const char* format_string;
-
-        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-          format_string = "HIP [ 0x%.12lx + %ld ] %s\n";
-        } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-          format_string = "HIP [ 0x%.12llx + %ld ] %s\n";
-        }
-
-        snprintf(buffer, 256, format_string,
-                 reinterpret_cast<uintptr_t>(r->data()), r->size(),
-                 head.m_label);
-      } else {
-        snprintf(buffer, 256, "HIP [ 0 + 0 ]\n");
-      }
-      s << buffer;
-      r = r->m_next;
-    } while (r != &s_root_record);
-  }
-#else
-  (void)s;
-  (void)detail;
-  throw_runtime_exception(
-      "Kokkos::Impl::SharedAllocationRecord<HIPSpace>::print_records"
-      " only works with KOKKOS_ENABLE_DEBUG enabled");
-#endif
+  this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr,
+                                                  arg_label);
 }
 
 }  // namespace Impl
@@ -680,63 +421,22 @@ void HIP::impl_initialize(const HIP::SelectDevice config) {
 void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); }
 
 HIP::HIP()
-    : m_space_instance(&Impl::HIPInternal::singleton()), m_counter(nullptr) {
+    : m_space_instance(&Impl::HIPInternal::singleton(),
+                       [](Impl::HIPInternal*) {}) {
   Impl::HIPInternal::singleton().verify_is_initialized(
       "HIP instance constructor");
 }
 
 HIP::HIP(hipStream_t const stream)
-    : m_space_instance(new Impl::HIPInternal), m_counter(new int(1)) {
+    : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) {
+        ptr->finalize();
+        delete ptr;
+      }) {
   Impl::HIPInternal::singleton().verify_is_initialized(
       "HIP instance constructor");
   m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream);
 }
 
-KOKKOS_FUNCTION HIP::HIP(HIP&& other) noexcept {
-  m_space_instance       = other.m_space_instance;
-  other.m_space_instance = nullptr;
-  m_counter              = other.m_counter;
-  other.m_counter        = nullptr;
-}
-
-KOKKOS_FUNCTION HIP::HIP(HIP const& other)
-    : m_space_instance(other.m_space_instance), m_counter(other.m_counter) {
-#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU
-  if (m_counter) Kokkos::atomic_add(m_counter, 1);
-#endif
-}
-
-KOKKOS_FUNCTION HIP& HIP::operator=(HIP&& other) noexcept {
-  m_space_instance       = other.m_space_instance;
-  other.m_space_instance = nullptr;
-  m_counter              = other.m_counter;
-  other.m_counter        = nullptr;
-
-  return *this;
-}
-
-KOKKOS_FUNCTION HIP& HIP::operator=(HIP const& other) {
-  m_space_instance = other.m_space_instance;
-  m_counter        = other.m_counter;
-#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU
-  if (m_counter) Kokkos::atomic_add(m_counter, 1);
-#endif
-
-  return *this;
-}
-
-KOKKOS_FUNCTION HIP::~HIP() noexcept {
-#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU
-  if (m_counter == nullptr) return;
-  int const count = Kokkos::atomic_fetch_sub(m_counter, 1);
-  if (count == 1) {
-    delete m_counter;
-    m_space_instance->finalize();
-    delete m_space_instance;
-  }
-#endif
-}
-
 void HIP::print_configuration(std::ostream& s, const bool) {
   Impl::HIPInternal::singleton().print_configuration(s);
 }
@@ -810,3 +510,26 @@ void HIPSpaceInitializer::print_configuration(std::ostream& msg,
 
 }  // namespace Impl
 }  // namespace Kokkos
+
+//==============================================================================
+// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
+
+#include <impl/Kokkos_SharedAlloc_timpl.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+// To avoid additional compilation cost for something that's (mostly?) not
+// performance sensitive, we explicity instantiate these CRTP base classes here,
+// where we have access to the associated *_timpl.hpp header files.
+template class HostInaccessibleSharedAllocationRecordCommon<
+    Kokkos::Experimental::HIPSpace>;
+template class SharedAllocationRecordCommon<Kokkos::Experimental::HIPSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::HIPHostPinnedSpace>;
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
+//==============================================================================
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
index 7571510c31fa6d082017150120cf6ef67e83a321..fe52886ced7c7a72454f9e731b3b5b4778f90073 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
@@ -644,13 +644,14 @@ KOKKOS_INLINE_FUNCTION
       thread, count);
 }
 
-template <typename iType>
-KOKKOS_INLINE_FUNCTION
-    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>
-    ThreadVectorRange(const Impl::HIPTeamMember& thread, iType arg_begin,
-                      iType arg_end) {
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Impl::HIPTeamMember>
+ThreadVectorRange(const Impl::HIPTeamMember& thread, iType1 arg_begin,
+                  iType2 arg_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>(
-      thread, arg_begin, arg_end);
+      thread, iType(arg_begin), iType(arg_end));
 }
 
 KOKKOS_INLINE_FUNCTION
@@ -961,7 +962,7 @@ KOKKOS_INLINE_FUNCTION
 
 //----------------------------------------------------------------------------
 
-/** \brief  Intra-thread vector parallel exclusive prefix sum.
+/** \brief  Intra-thread vector parallel scan with reducer.
  *
  *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
  *
@@ -969,22 +970,21 @@ KOKKOS_INLINE_FUNCTION
  *  thread and a scan operation is performed.
  *  The last call to closure has final == true.
  */
-template <typename iType, class Closure>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
-        loop_boundaries,
-    const Closure& closure) {
+template <typename iType, class Closure, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                      iType, Impl::HIPTeamMember>& loop_boundaries,
+                  const Closure& closure, const ReducerType& reducer) {
 #ifdef __HIP_DEVICE_COMPILE__
-  // Extract value_type from closure
-
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+  using value_type = typename ReducerType::value_type;
+  value_type accum;
+  reducer.init(accum);
+  const value_type identity = accum;
 
   // Loop through boundaries by vector-length chunks
   // must scan at each iteration
 
-  value_type accum = 0;
-
   // All thread "lanes" must loop the same number of times.
   // Determine an loop end for all thread "lanes."
   // Requires:
@@ -997,47 +997,72 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   const int end  = loop_boundaries.end + (rem ? blockDim.x - rem : 0);
 
   for (int i = threadIdx.x; i < end; i += blockDim.x) {
-    value_type val = 0;
-
-    // First acquire per-lane contributions:
-    if (i < loop_boundaries.end) closure(i, val, false);
+    value_type val = identity;
 
-    value_type sval = val;
+    // First acquire per-lane contributions.
+    // This sets i's val to i-1's contribution
+    // to make the latter in_place_shfl_up an
+    // exclusive scan -- the final accumulation
+    // of i's val will be included in the second
+    // closure call later.
+    if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false);
 
-    // Bottom up inclusive scan in triangular pattern
+    // Bottom up exclusive scan in triangular pattern
     // where each HIP thread is the root of a reduction tree
     // from the zeroth "lane" to itself.
     //  [t] += [t-1] if t >= 1
     //  [t] += [t-2] if t >= 2
     //  [t] += [t-4] if t >= 4
     //  ...
-
+    //  This differs from the non-reducer overload, where an inclusive scan was
+    //  implemented, because in general the binary operator cannot be inverted
+    //  and we would not be able to remove the inclusive contribution by
+    //  inversion.
     for (int j = 1; j < static_cast<int>(blockDim.x); j <<= 1) {
-      value_type tmp = 0;
-      ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, sval, j, blockDim.x);
+      value_type tmp = identity;
+      ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, val, j, blockDim.x);
       if (j <= static_cast<int>(threadIdx.x)) {
-        sval += tmp;
+        reducer.join(val, tmp);
       }
     }
 
-    // Include accumulation and remove value for exclusive scan:
-    val = accum + sval - val;
+    // Include accumulation
+    reducer.join(val, accum);
 
-    // Provide exclusive scan value:
+    // Update i's contribution into the val
+    // and add it to accum for next round
     if (i < loop_boundaries.end) closure(i, val, true);
-
-    // Accumulate the last value in the inclusive scan:
-    ::Kokkos::Experimental::Impl::in_place_shfl(sval, sval, blockDim.x - 1,
+    ::Kokkos::Experimental::Impl::in_place_shfl(accum, val, blockDim.x - 1,
                                                 blockDim.x);
-
-    accum += sval;
   }
 #else
   (void)loop_boundaries;
   (void)closure;
+  (void)reducer;
 #endif
 }
 
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template <typename iType, class Closure>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
+        loop_boundaries,
+    const Closure& closure) {
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+  value_type dummy;
+  parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
+}
+
 }  // namespace Kokkos
 
 namespace Kokkos {
diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index 140376425c2910c2d50a73d68f4fee27e57ee8cf..b7d8e62f696073bfa4794b362401aaca288de021 100644
--- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -48,17 +48,11 @@
 #include <initializer_list>
 
 #include <Kokkos_Layout.hpp>
-
+#include <Kokkos_Array.hpp>
 #include <impl/KokkosExp_Host_IterateTile.hpp>
 #include <Kokkos_ExecPolicy.hpp>
-#include <Kokkos_Parallel.hpp>
 #include <type_traits>
 
-#if defined(KOKKOS_ENABLE_CUDA) || \
-    (defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP))
-#include <impl/KokkosExp_IterateTileGPU.hpp>
-#endif
-
 namespace Kokkos {
 
 // ------------------------------------------------------------------ //
@@ -74,22 +68,14 @@ enum class Iterate
 
 template <typename ExecSpace>
 struct default_outer_direction {
-  using type = Iterate;
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
-  static constexpr Iterate value = Iterate::Left;
-#else
+  using type                     = Iterate;
   static constexpr Iterate value = Iterate::Right;
-#endif
 };
 
 template <typename ExecSpace>
 struct default_inner_direction {
-  using type = Iterate;
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
-  static constexpr Iterate value = Iterate::Left;
-#else
+  using type                     = Iterate;
   static constexpr Iterate value = Iterate::Right;
-#endif
 };
 
 // Iteration Pattern
@@ -179,6 +165,25 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing(
   }
   return a;
 }
+
+struct TileSizeProperties {
+  int max_threads;
+  int default_largest_tile_size;
+  int default_tile_size;
+  int max_total_tile_size;
+};
+
+template <typename ExecutionSpace>
+TileSizeProperties get_tile_size_properties(const ExecutionSpace&) {
+  // Host settings
+  TileSizeProperties properties;
+  properties.max_threads               = std::numeric_limits<int>::max();
+  properties.default_largest_tile_size = 0;
+  properties.default_tile_size         = 2;
+  properties.max_total_tile_size       = std::numeric_limits<int>::max();
+  return properties;
+}
+
 }  // namespace Impl
 
 // multi-dimensional iteration pattern
@@ -208,7 +213,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
   using launch_bounds     = typename traits::launch_bounds;
   using member_type       = typename range_policy::member_type;
 
-  enum { rank = static_cast<int>(iteration_pattern::rank) };
+  static constexpr int rank = iteration_pattern::rank;
 
   using index_type       = typename traits::index_type;
   using array_index_type = std::int64_t;
@@ -231,37 +236,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
   point_type m_tile_end       = {};
   index_type m_num_tiles      = 1;
   index_type m_prod_tile_dims = 1;
+  bool m_tune_tile_size       = false;
 
-  /*
-    // NDE enum impl definition alternative - replace static constexpr int ?
-    enum { outer_direction = static_cast<int> (
-        (iteration_pattern::outer_direction != Iterate::Default)
-      ? iteration_pattern::outer_direction
-      : default_outer_direction< typename traits::execution_space>::value ) };
-
-    enum { inner_direction = static_cast<int> (
-        iteration_pattern::inner_direction != Iterate::Default
-      ? iteration_pattern::inner_direction
-      : default_inner_direction< typename traits::execution_space>::value ) };
-
-    enum { Right = static_cast<int>( Iterate::Right ) };
-    enum { Left  = static_cast<int>( Iterate::Left ) };
-  */
-  // static constexpr int rank = iteration_pattern::rank;
-
-  static constexpr int outer_direction = static_cast<int>(
+  static constexpr auto outer_direction =
       (iteration_pattern::outer_direction != Iterate::Default)
           ? iteration_pattern::outer_direction
-          : default_outer_direction<typename traits::execution_space>::value);
+          : default_outer_direction<typename traits::execution_space>::value;
 
-  static constexpr int inner_direction = static_cast<int>(
+  static constexpr auto inner_direction =
       iteration_pattern::inner_direction != Iterate::Default
           ? iteration_pattern::inner_direction
-          : default_inner_direction<typename traits::execution_space>::value);
+          : default_inner_direction<typename traits::execution_space>::value;
 
-  // Ugly ugly workaround intel 14 not handling scoped enum correctly
-  static constexpr int Right = static_cast<int>(Iterate::Right);
-  static constexpr int Left  = static_cast<int>(Iterate::Left);
+  static constexpr auto Right = Iterate::Right;
+  static constexpr auto Left  = Iterate::Left;
 
   KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const {
     return m_space;
@@ -320,7 +308,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
                 point_type const& lower, point_type const& upper,
                 tile_type const& tile = tile_type{})
       : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) {
-    init();
+    init_helper(Impl::get_tile_size_properties(work_space));
   }
 
   template <typename T, std::size_t NT = rank,
@@ -354,93 +342,56 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
         m_tile(p.m_tile),
         m_tile_end(p.m_tile_end),
         m_num_tiles(p.m_num_tiles),
-        m_prod_tile_dims(p.m_prod_tile_dims) {}
+        m_prod_tile_dims(p.m_prod_tile_dims),
+        m_tune_tile_size(p.m_tune_tile_size) {}
+
+  void impl_change_tile_size(const point_type& tile) {
+    m_tile = tile;
+    init_helper(Impl::get_tile_size_properties(m_space));
+  }
+  bool impl_tune_tile_size() const { return m_tune_tile_size; }
 
  private:
-  void init() {
-    // Host
-    if (true
-#if defined(KOKKOS_ENABLE_CUDA)
-        && !std::is_same<typename traits::execution_space, Kokkos::Cuda>::value
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-        && !std::is_same<typename traits::execution_space,
-                         Kokkos::Experimental::HIP>::value
-#endif
-    ) {
-      index_type span;
-      for (int i = 0; i < rank; ++i) {
-        span = m_upper[i] - m_lower[i];
-        if (m_tile[i] <= 0) {
-          if (((int)inner_direction == (int)Right && (i < rank - 1)) ||
-              ((int)inner_direction == (int)Left && (i > 0))) {
-            m_tile[i] = 2;
-          } else {
-            m_tile[i] = (span == 0 ? 1 : span);
-          }
-        }
-        m_tile_end[i] =
-            static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
-        m_num_tiles *= m_tile_end[i];
-        m_prod_tile_dims *= m_tile[i];
-      }
+  void init_helper(Impl::TileSizeProperties properties) {
+    m_prod_tile_dims = 1;
+    int increment    = 1;
+    int rank_start   = 0;
+    int rank_end     = rank;
+    if (inner_direction == Iterate::Right) {
+      increment  = -1;
+      rank_start = rank - 1;
+      rank_end   = -1;
     }
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
-    else  // Cuda or HIP
-    {
-      index_type span;
-      int increment  = 1;
-      int rank_start = 0;
-      int rank_end   = rank;
-      if ((int)inner_direction == (int)Right) {
-        increment  = -1;
-        rank_start = rank - 1;
-        rank_end   = -1;
-      }
-      bool is_cuda_exec_space =
-#if defined(KOKKOS_ENABLE_CUDA)
-          std::is_same<typename traits::execution_space, Kokkos::Cuda>::value;
-#else
-          false;
-#endif
-      for (int i = rank_start; i != rank_end; i += increment) {
-        span = m_upper[i] - m_lower[i];
-        if (m_tile[i] <= 0) {
-          // TODO: determine what is a good default tile size for Cuda and HIP
-          // may be rank dependent
-          if (((int)inner_direction == (int)Right && (i < rank - 1)) ||
-              ((int)inner_direction == (int)Left && (i > 0))) {
-            if (m_prod_tile_dims < 256) {
-              m_tile[i] = (is_cuda_exec_space) ? 2 : 4;
-            } else {
-              m_tile[i] = 1;
-            }
+    for (int i = rank_start; i != rank_end; i += increment) {
+      const index_type length = m_upper[i] - m_lower[i];
+      if (m_tile[i] <= 0) {
+        m_tune_tile_size = true;
+        if ((inner_direction == Iterate::Right && (i < rank - 1)) ||
+            (inner_direction == Iterate::Left && (i > 0))) {
+          if (m_prod_tile_dims * properties.default_tile_size <
+              static_cast<index_type>(properties.max_total_tile_size)) {
+            m_tile[i] = properties.default_tile_size;
           } else {
-            m_tile[i] = 16;
+            m_tile[i] = 1;
           }
-        }
-        m_tile_end[i] =
-            static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
-        m_num_tiles *= m_tile_end[i];
-        m_prod_tile_dims *= m_tile[i];
-      }
-      if (m_prod_tile_dims >
-          1024) {  // Match Cuda restriction for ParallelReduce; 1024,1024,64
-                   // max per dim (Kepler), but product num_threads < 1024
-        if (is_cuda_exec_space) {
-          printf(" Tile dimensions exceed Cuda limits\n");
-          Kokkos::abort(
-              "Cuda ExecSpace Error: MDRange tile dims exceed maximum number "
-              "of threads per block - choose smaller tile dims");
         } else {
-          printf(" Tile dimensions exceed HIP limits\n");
-          Kokkos::abort(
-              "HIP ExecSpace Error: MDRange tile dims exceed maximum number of "
-              "threads per block - choose smaller tile dims");
+          m_tile[i] = properties.default_largest_tile_size == 0
+                          ? std::max<int>(length, 1)
+                          : properties.default_largest_tile_size;
         }
       }
+      m_tile_end[i] =
+          static_cast<index_type>((length + m_tile[i] - 1) / m_tile[i]);
+      m_num_tiles *= m_tile_end[i];
+      m_prod_tile_dims *= m_tile[i];
+    }
+    if (m_prod_tile_dims > static_cast<index_type>(properties.max_threads)) {
+      printf(" Product of tile dimensions exceed maximum limit: %d\n",
+             static_cast<int>(properties.max_threads));
+      Kokkos::abort(
+          "ExecSpace Error: MDRange tile dims exceed maximum number "
+          "of threads per block - choose smaller tile dims");
     }
-#endif
   }
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
index 8e226a078d1adfa275ad5a8c0263dfeeb41c4787..fb94049d7ad7ed588b00cc1f9351162de32f08e5 100644
--- a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
@@ -104,20 +104,6 @@ struct MemorySpaceAccess<Kokkos::AnonymousSpace, Kokkos::AnonymousSpace> {
   enum : bool { deepcopy = true };
 };
 
-template <typename OtherSpace>
-struct VerifyExecutionCanAccessMemorySpace<OtherSpace, Kokkos::AnonymousSpace> {
-  enum { value = 1 };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
-};
-
-template <typename OtherSpace>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::AnonymousSpace, OtherSpace> {
-  enum { value = 1 };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
-};
-
 }  // namespace Impl
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp
index fb2925a066f545bce8636ea76aabc5794f78f587..6578723fc8e5dab1e605b1a5dc80f1daf4b2ebfb 100644
--- a/packages/kokkos/core/src/Kokkos_Complex.hpp
+++ b/packages/kokkos/core/src/Kokkos_Complex.hpp
@@ -45,14 +45,13 @@
 #define KOKKOS_COMPLEX_HPP
 
 #include <Kokkos_Atomic.hpp>
+#include <Kokkos_MathematicalFunctions.hpp>
 #include <Kokkos_NumericTraits.hpp>
+#include <impl/Kokkos_Error.hpp>
 #include <complex>
+#include <type_traits>
 #include <iosfwd>
 
-#ifdef KOKKOS_ENABLE_SYCL
-#include <CL/sycl.hpp>
-#endif
-
 namespace Kokkos {
 
 /// \class complex
@@ -220,10 +219,11 @@ class
   // Conditional noexcept, just in case RType throws on divide-by-zero
   KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
       const complex<RealType>& y) noexcept(noexcept(RealType{} / RealType{})) {
+    using Kokkos::Experimental::fabs;
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
-    const RealType s = std::fabs(y.real()) + std::fabs(y.imag());
+    const RealType s = fabs(y.real()) + fabs(y.imag());
 
     // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
     // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
@@ -248,10 +248,11 @@ class
   KOKKOS_INLINE_FUNCTION complex& operator/=(
       const std::complex<RealType>& y) noexcept(noexcept(RealType{} /
                                                          RealType{})) {
+    using Kokkos::Experimental::fabs;
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
-    const RealType s = std::fabs(y.real()) + std::fabs(y.imag());
+    const RealType s = fabs(y.real()) + fabs(y.imag());
 
     // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
     // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
@@ -693,35 +694,96 @@ KOKKOS_INLINE_FUNCTION RealType real(const complex<RealType>& x) noexcept {
   return x.real();
 }
 
+//! Constructs a complex number from magnitude and phase angle
+template <class T>
+KOKKOS_INLINE_FUNCTION complex<T> polar(const T& r, const T& theta = T()) {
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::sin;
+  KOKKOS_EXPECTS(r >= 0);
+  return complex<T>(r * cos(theta), r * sin(theta));
+}
+
 //! Absolute value (magnitude) of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION RealType abs(const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::hypot;
-#else
-  using std::hypot;
-#endif
+  using Kokkos::Experimental::hypot;
   return hypot(x.real(), x.imag());
 }
 
 //! Power of a complex number
-template <class RealType>
-KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> pow(const complex<RealType>& x,
-                                                     const RealType& e) {
-  RealType r = abs(x);
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::atan;
-  using cl::sycl::cos;
-  using cl::sycl::pow;
-  using cl::sycl::sin;
-#else
-  using std::atan;
-  using std::cos;
-  using std::pow;
-  using std::sin;
-#endif
-  RealType phi = atan(x.imag() / x.real());
-  return pow(r, e) * Kokkos::complex<RealType>(cos(phi * e), sin(phi * e));
+template <class T>
+KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x, const T& y) {
+  using Kokkos::Experimental::atan2;
+  using Kokkos::Experimental::pow;
+  T r     = abs(x);
+  T theta = atan2(x.imag(), x.real());
+  return polar(pow(r, y), y * theta);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION complex<T> pow(const T& x, const complex<T>& y) {
+  return pow(complex<T>(x), y);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x,
+                                      const complex<T>& y) {
+  using Kokkos::Experimental::log;
+
+  return x == T() ? T() : exp(y * log(x));
+}
+
+namespace Impl {
+// NOTE promote would also be useful for math functions
+template <class T, bool = std::is_integral<T>::value>
+struct promote {
+  using type = double;
+};
+template <class T>
+struct promote<T, false> {};
+template <>
+struct promote<long double> {
+  using type = long double;
+};
+template <>
+struct promote<double> {
+  using type = double;
+};
+template <>
+struct promote<float> {
+  using type = float;
+};
+template <class T>
+using promote_t = typename promote<T>::type;
+template <class T, class U>
+struct promote_2 {
+  using type = decltype(promote_t<T>() + promote_t<U>());
+};
+template <class T, class U>
+using promote_2_t = typename promote_2<T, U>::type;
+}  // namespace Impl
+
+template <class T, class U,
+          class = std::enable_if_t<std::is_arithmetic<T>::value>>
+KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(
+    const T& x, const complex<U>& y) {
+  using type = Impl::promote_2_t<T, U>;
+  return pow(type(x), complex<type>(y));
+}
+
+template <class T, class U,
+          class = std::enable_if_t<std::is_arithmetic<U>::value>>
+KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(const complex<T>& x,
+                                                            const U& y) {
+  using type = Impl::promote_2_t<T, U>;
+  return pow(complex<type>(x), type(y));
+}
+
+template <class T, class U>
+KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(
+    const complex<T>& x, const complex<U>& y) {
+  using type = Impl::promote_2_t<T, U>;
+  return pow(complex<type>(x), complex<type>(y));
 }
 
 //! Square root of a complex number. This is intended to match the stdc++
@@ -729,26 +791,21 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> pow(const complex<RealType>& x,
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sqrt(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::abs;
-  using cl::sycl::sqrt;
-#else
-  using std::abs;
-  using std::sqrt;
-#endif
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::sqrt;
 
   RealType r = x.real();
   RealType i = x.imag();
 
   if (r == RealType()) {
-    RealType t = sqrt(abs(i) / 2);
+    RealType t = sqrt(fabs(i) / 2);
     return Kokkos::complex<RealType>(t, i < RealType() ? -t : t);
   } else {
-    RealType t = sqrt(2 * (abs(x) + abs(r)));
+    RealType t = sqrt(2 * (abs(x) + fabs(r)));
     RealType u = t / 2;
-    return r > RealType()
-               ? Kokkos::complex<RealType>(u, i / t)
-               : Kokkos::complex<RealType>(abs(i) / t, i < RealType() ? -u : u);
+    return r > RealType() ? Kokkos::complex<RealType>(u, i / t)
+                          : Kokkos::complex<RealType>(fabs(i) / t,
+                                                      i < RealType() ? -u : u);
   }
 }
 
@@ -762,15 +819,9 @@ KOKKOS_INLINE_FUNCTION complex<RealType> conj(
 //! Exponential of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::cos;
-  using cl::sycl::exp;
-  using cl::sycl::sin;
-#else
-  using std::cos;
-  using std::exp;
-  using std::sin;
-#endif
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::sin;
   return exp(x.real()) * complex<RealType>(cos(x.imag()), sin(x.imag()));
 }
 
@@ -778,14 +829,9 @@ KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) {
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> log(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::atan;
-  using cl::sycl::log;
-#else
-  using std::atan;
-  using std::log;
-#endif
-  RealType phi = atan(x.imag() / x.real());
+  using Kokkos::Experimental::atan2;
+  using Kokkos::Experimental::log;
+  RealType phi = atan2(x.imag(), x.real());
   return Kokkos::complex<RealType>(log(abs(x)), phi);
 }
 
@@ -793,17 +839,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> log(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sin(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::cos;
-  using cl::sycl::cosh;
-  using cl::sycl::sin;
-  using cl::sycl::sinh;
-#else
-  using std::cos;
-  using std::cosh;
-  using std::sin;
-  using std::sinh;
-#endif
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::cosh;
+  using Kokkos::Experimental::sin;
+  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(sin(x.real()) * cosh(x.imag()),
                                    cos(x.real()) * sinh(x.imag()));
 }
@@ -812,17 +851,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sin(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> cos(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::cos;
-  using cl::sycl::cosh;
-  using cl::sycl::sin;
-  using cl::sycl::sinh;
-#else
-  using std::cos;
-  using std::cosh;
-  using std::sin;
-  using std::sinh;
-#endif
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::cosh;
+  using Kokkos::Experimental::sin;
+  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(cos(x.real()) * cosh(x.imag()),
                                    -sin(x.real()) * sinh(x.imag()));
 }
@@ -838,17 +870,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> tan(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sinh(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::cos;
-  using cl::sycl::cosh;
-  using cl::sycl::sin;
-  using cl::sycl::sinh;
-#else
-  using std::cos;
-  using std::cosh;
-  using std::sin;
-  using std::sinh;
-#endif
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::cosh;
+  using Kokkos::Experimental::sin;
+  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(sinh(x.real()) * cos(x.imag()),
                                    cosh(x.real()) * sin(x.imag()));
 }
@@ -857,17 +882,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sinh(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> cosh(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::cos;
-  using cl::sycl::cosh;
-  using cl::sycl::sin;
-  using cl::sycl::sinh;
-#else
-  using std::cos;
-  using std::cosh;
-  using std::sin;
-  using std::sinh;
-#endif
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::cosh;
+  using Kokkos::Experimental::sin;
+  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(cosh(x.real()) * cos(x.imag()),
                                    sinh(x.real()) * sin(x.imag()));
 }
@@ -898,13 +916,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acosh(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> atanh(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::atan2;
-  using cl::sycl::log;
-#else
-  using std::atan2;
-  using std::log;
-#endif
+  using Kokkos::Experimental::atan2;
+  using Kokkos::Experimental::log;
 
   const RealType i2 = x.imag() * x.imag();
   const RealType r  = RealType(1.0) - i2 - x.real() * x.real();
@@ -933,12 +946,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> asin(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acos(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::acos;
-
-#else
-  using std::acos;
-#endif
+  using Kokkos::Experimental::acos;
   Kokkos::complex<RealType> t = asin(x);
   RealType pi_2               = acos(RealType(0.0));
   return Kokkos::complex<RealType>(pi_2 - t.real(), -t.imag());
@@ -948,13 +956,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acos(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> atan(
     const complex<RealType>& x) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL
-  using cl::sycl::atan2;
-  using cl::sycl::log;
-#else
-  using std::atan2;
-  using std::log;
-#endif
+  using Kokkos::Experimental::atan2;
+  using Kokkos::Experimental::log;
   const RealType r2 = x.real() * x.real();
   const RealType i  = RealType(1.0) - r2 - x.imag() * x.imag();
 
@@ -996,12 +999,13 @@ KOKKOS_INLINE_FUNCTION
     operator/(const complex<RealType1>& x,
               const complex<RealType2>& y) noexcept(noexcept(RealType1{} /
                                                              RealType2{})) {
+  using Kokkos::Experimental::fabs;
   // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
   // If the real part is +/-Inf and the imaginary part is -/+Inf,
   // this won't change the result.
   using common_real_type =
       typename std::common_type<RealType1, RealType2>::type;
-  const common_real_type s = std::fabs(real(y)) + std::fabs(imag(y));
+  const common_real_type s = fabs(real(y)) + fabs(imag(y));
 
   // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
   // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
@@ -1046,7 +1050,7 @@ std::istream& operator>>(std::istream& is, complex<RealType>& x) {
 }
 
 template <class T>
-struct reduction_identity<Kokkos::complex<T> > {
+struct reduction_identity<Kokkos::complex<T>> {
   using t_red_ident = reduction_identity<T>;
   KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T>
   sum() noexcept {
diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp
index 4dac463a667169472c94e5b2076b1c224019a379..c3771ab393f3aaf8f77cb474056d90e867ff03da 100644
--- a/packages/kokkos/core/src/Kokkos_Core.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core.hpp
@@ -58,6 +58,7 @@
 #include <Kokkos_AnonymousSpace.hpp>
 #include <Kokkos_LogicalSpaces.hpp>
 #include <Kokkos_Pair.hpp>
+#include <Kokkos_MathematicalFunctions.hpp>
 #include <Kokkos_MemoryPool.hpp>
 #include <Kokkos_Array.hpp>
 #include <Kokkos_View.hpp>
@@ -86,6 +87,10 @@ struct InitArguments {
   int skip_device;
   bool disable_warnings;
   bool tune_internals;
+  bool tool_help        = false;
+  std::string tool_lib  = {};
+  std::string tool_args = {};
+
   InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false,
                 bool ti = false)
       : num_threads{nt},
@@ -139,6 +144,10 @@ void pre_initialize(const InitArguments& args);
 
 void post_initialize(const InitArguments& args);
 
+void declare_configuration_metadata(const std::string& category,
+                                    const std::string& key,
+                                    const std::string& value);
+
 }  // namespace Impl
 
 bool is_initialized() noexcept;
diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
index 7502719c73d004e8f4bf79ac214209c210645c7b..fe7eba3f6ef178848d2ea832341014d6dc5d1003 100644
--- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -50,6 +50,7 @@
 // and compiler environment then sets a collection of #define macros.
 
 #include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Utilities.hpp>
 
 #include <Kokkos_MasterLock.hpp>
@@ -180,7 +181,6 @@ using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
 // a given memory space.
 
 namespace Kokkos {
-
 namespace Impl {
 
 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \
@@ -196,16 +196,22 @@ using ActiveExecutionMemorySpace = Kokkos::HostSpace;
 using ActiveExecutionMemorySpace = void;
 #endif
 
-template <class ActiveSpace, class MemorySpace>
-struct VerifyExecutionCanAccessMemorySpace {
-  enum { value = 0 };
+template <typename DstMemorySpace, typename SrcMemorySpace>
+struct MemorySpaceAccess;
+
+template <typename DstMemorySpace, typename SrcMemorySpace,
+          bool = Kokkos::Impl::MemorySpaceAccess<DstMemorySpace,
+                                                 SrcMemorySpace>::accessible>
+struct verify_space {
+  KOKKOS_FUNCTION static void check() {}
 };
 
-template <class Space>
-struct VerifyExecutionCanAccessMemorySpace<Space, Space> {
-  enum { value = 1 };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
+template <typename DstMemorySpace, typename SrcMemorySpace>
+struct verify_space<DstMemorySpace, SrcMemorySpace, false> {
+  KOKKOS_FUNCTION static void check() {
+    Kokkos::abort(
+        "Kokkos::View ERROR: attempt to access inaccessible memory space");
+  };
 };
 
 // Base class for exec space initializer factories
@@ -220,13 +226,13 @@ class LogicalMemorySpace;
 
 }  // namespace Kokkos
 
-#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<            \
-      Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify(DATA_PTR)
+#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR)        \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             DATA_SPACE>::check();
 
-#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE)    \
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
-      Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify()
+#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE)                      \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             DATA_SPACE>::check();
 
 //----------------------------------------------------------------------------
 
@@ -256,8 +262,7 @@ template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace,
           int Rank, typename iType>
 struct ViewCopy;
 
-template <class Functor, class Policy, class EnableFunctor = void,
-          class EnablePolicy = void>
+template <class Functor, class Policy>
 struct FunctorPolicyExecutionSpace;
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp
index 4a573d82c044b532a58274b52d539f495d4f0ba6..1a10500b19a55f4f963807dd2cf1a28e6062f98c 100644
--- a/packages/kokkos/core/src/Kokkos_Crs.hpp
+++ b/packages/kokkos/core/src/Kokkos_Crs.hpp
@@ -199,7 +199,7 @@ class CrsRowMapFromCounts {
  public:
   KOKKOS_INLINE_FUNCTION
   void operator()(index_type i, value_type& update, bool final_pass) const {
-    if (i < m_in.size()) {
+    if (i < static_cast<index_type>(m_in.size())) {
       update += m_in(i);
       if (final_pass) m_out(i + 1) = update;
     } else if (final_pass) {
diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp
index 81e11f3f1285f85d2424b9e98930e3b6cb051162..7a218120bb7bb3b053335946ae25ad58c8a85e6d 100644
--- a/packages/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp
@@ -63,6 +63,7 @@
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -198,16 +199,6 @@ class Cuda {
 
   Cuda();
 
-  KOKKOS_FUNCTION Cuda(Cuda&& other) noexcept;
-
-  KOKKOS_FUNCTION Cuda(const Cuda& other);
-
-  KOKKOS_FUNCTION Cuda& operator=(Cuda&& other) noexcept;
-
-  KOKKOS_FUNCTION Cuda& operator=(const Cuda& other);
-
-  KOKKOS_FUNCTION ~Cuda() noexcept;
-
   Cuda(cudaStream_t stream);
 
   //--------------------------------------------------------------------------
@@ -253,13 +244,12 @@ class Cuda {
   static const char* name();
 
   inline Impl::CudaInternal* impl_internal_space_instance() const {
-    return m_space_instance;
+    return m_space_instance.get();
   }
   uint32_t impl_instance_id() const noexcept { return 0; }
 
  private:
-  Impl::CudaInternal* m_space_instance;
-  int* m_counter;
+  Kokkos::Impl::HostSharedPtr<Impl::CudaInternal> m_space_instance;
 };
 
 namespace Tools {
@@ -319,38 +309,8 @@ struct MemorySpaceAccess<Kokkos::CudaUVMSpace,
 
 #endif
 
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
-                                           Kokkos::Cuda::scratch_memory_space> {
-  enum : bool { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
-};
-
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
-                                           Kokkos::Cuda::scratch_memory_space> {
-  enum : bool { value = false };
-  inline static void verify(void) { CudaSpace::access_error(); }
-  inline static void verify(const void* p) { CudaSpace::access_error(p); }
-};
-
 }  // namespace Impl
 }  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
-#include <Cuda/Kokkos_Cuda_Instance.hpp>
-#include <Cuda/Kokkos_Cuda_View.hpp>
-#include <Cuda/Kokkos_Cuda_Team.hpp>
-#include <Cuda/Kokkos_Cuda_Parallel.hpp>
-#include <Cuda/Kokkos_Cuda_Task.hpp>
-#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-//----------------------------------------------------------------------------
-
 #endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
 #endif /* #ifndef KOKKOS_CUDA_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
index fc1c0e2f8a1047cae0c57b31b2ea61d6bec92def..e10fae93c7ca01ce90f31b5d22ca9bff7d113884 100644
--- a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -53,8 +53,10 @@
 #include <iosfwd>
 #include <typeinfo>
 #include <string>
+#include <memory>
 
 #include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
 
@@ -119,8 +121,8 @@ class CudaSpace {
 
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
-  static void access_error();
-  static void access_error(const void* const);
+  KOKKOS_DEPRECATED static void access_error();
+  KOKKOS_DEPRECATED static void access_error(const void* const);
 
  private:
   int m_device;  ///< Which Cuda device
@@ -128,42 +130,6 @@ class CudaSpace {
   static constexpr const char* m_name = "Cuda";
   friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
 };
-
-namespace Impl {
-/// \brief Initialize lock array for arbitrary size atomics.
-///
-/// Arbitrary atomics are implemented using a hash table of locks
-/// where the hash value is derived from the address of the
-/// object for which an atomic operation is performed.
-/// This function initializes the locks to zero (unset).
-void init_lock_arrays_cuda_space();
-
-/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
-///
-/// Arbitrary atomics are implemented using a hash table of locks
-/// where the hash value is derived from the address of the
-/// object for which an atomic operation is performed.
-/// This function retrieves the lock array pointer.
-/// If the array is not yet allocated it will do so.
-int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
-
-/// \brief Retrieve the pointer to the scratch array for team and thread private
-/// global memory.
-///
-/// Team and Thread private scratch allocations in
-/// global memory are acquired via locks.
-/// This function retrieves the lock array pointer.
-/// If the array is not yet allocated it will do so.
-int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
-
-/// \brief Retrieve the pointer to the scratch array for unique identifiers.
-///
-/// Unique identifiers in the range 0-Cuda::concurrency
-/// are provided via locks.
-/// This function retrieves the lock array pointer.
-/// If the array is not yet allocated it will do so.
-int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
-}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -313,6 +279,11 @@ class CudaHostPinnedSpace {
 namespace Kokkos {
 namespace Impl {
 
+cudaStream_t cuda_get_deep_copy_stream();
+
+const std::unique_ptr<Kokkos::Cuda>& cuda_get_deep_copy_space(
+    bool initialize = true);
+
 static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace,
                                               Kokkos::CudaSpace>::assignable,
               "");
@@ -784,104 +755,21 @@ struct DeepCopy<HostSpace, CudaHostPinnedSpace, ExecutionSpace> {
 namespace Kokkos {
 namespace Impl {
 
-/** Running in CudaSpace attempting to access HostSpace: error */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
-                                           Kokkos::HostSpace> {
-  enum : bool { value = false };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {
-    Kokkos::abort("Cuda code attempted to access HostSpace memory");
-  }
-
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {
-    Kokkos::abort("Cuda code attempted to access HostSpace memory");
-  }
-};
-
-/** Running in CudaSpace accessing CudaUVMSpace: ok */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
-                                           Kokkos::CudaUVMSpace> {
-  enum : bool { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
-};
-
-/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::CudaSpace,
-                                           Kokkos::CudaHostPinnedSpace> {
-  enum : bool { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
-};
-
-/** Running in CudaSpace attempting to access an unknown space: error */
-template <class OtherSpace>
-struct VerifyExecutionCanAccessMemorySpace<
-    typename std::enable_if<!std::is_same<Kokkos::CudaSpace, OtherSpace>::value,
-                            Kokkos::CudaSpace>::type,
-    OtherSpace> {
-  enum : bool { value = false };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {
-    Kokkos::abort("Cuda code attempted to access unknown Space memory");
-  }
-
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {
-    Kokkos::abort("Cuda code attempted to access unknown Space memory");
-  }
-};
-
-//----------------------------------------------------------------------------
-/** Running in HostSpace attempting to access CudaSpace */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
-                                           Kokkos::CudaSpace> {
-  enum : bool { value = false };
-  inline static void verify(void) { CudaSpace::access_error(); }
-  inline static void verify(const void* p) { CudaSpace::access_error(p); }
-};
-
-/** Running in HostSpace accessing CudaUVMSpace is OK */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
-                                           Kokkos::CudaUVMSpace> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
-/** Running in HostSpace accessing CudaHostPinnedSpace is OK */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
-                                           Kokkos::CudaHostPinnedSpace> {
-  enum : bool { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
 template <>
 class SharedAllocationRecord<Kokkos::CudaSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace> {
  private:
   friend class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>;
+  friend class SharedAllocationRecordCommon<Kokkos::CudaSpace>;
+  friend class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>;
 
   using RecordBase = SharedAllocationRecord<void, void>;
+  using base_t =
+      HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static void deallocate(RecordBase*);
-
   static ::cudaTextureObject_t attach_texture_object(
       const unsigned sizeof_alias, void* const alloc_ptr,
       const size_t alloc_size);
@@ -890,39 +778,19 @@ class SharedAllocationRecord<Kokkos::CudaSpace, void>
   static RecordBase s_root_record;
 #endif
 
-  ::cudaTextureObject_t m_tex_obj;
+  ::cudaTextureObject_t m_tex_obj = 0;
   const Kokkos::CudaSpace m_space;
 
  protected:
   ~SharedAllocationRecord();
-  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+  SharedAllocationRecord() = default;
 
   SharedAllocationRecord(
       const Kokkos::CudaSpace& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 
  public:
-  std::string get_label() const;
-
-  static SharedAllocationRecord* allocate(const Kokkos::CudaSpace& arg_space,
-                                          const std::string& arg_label,
-                                          const size_t arg_alloc_size);
-
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(const Kokkos::CudaSpace& arg_space,
-                                const std::string& arg_label,
-                                const size_t arg_alloc_size);
-
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
-
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
-
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
-
   template <typename AliasType>
   inline ::cudaTextureObject_t attach_texture_object() {
     static_assert((std::is_same<AliasType, int>::value ||
@@ -945,57 +813,35 @@ class SharedAllocationRecord<Kokkos::CudaSpace, void>
     // Texture object is attached to the entire allocation range
     return ptr - reinterpret_cast<AliasType*>(RecordBase::m_alloc_ptr);
   }
-
-  static void print_records(std::ostream&, const Kokkos::CudaSpace&,
-                            bool detail = false);
 };
 
 template <>
 class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public SharedAllocationRecordCommon<Kokkos::CudaUVMSpace> {
  private:
+  friend class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>;
+
+  using base_t     = SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>;
   using RecordBase = SharedAllocationRecord<void, void>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static void deallocate(RecordBase*);
-
   static RecordBase s_root_record;
 
-  ::cudaTextureObject_t m_tex_obj;
+  ::cudaTextureObject_t m_tex_obj = 0;
   const Kokkos::CudaUVMSpace m_space;
 
  protected:
   ~SharedAllocationRecord();
-  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+  SharedAllocationRecord() = default;
 
   SharedAllocationRecord(
       const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 
  public:
-  std::string get_label() const;
-
-  static SharedAllocationRecord* allocate(const Kokkos::CudaUVMSpace& arg_space,
-                                          const std::string& arg_label,
-                                          const size_t arg_alloc_size);
-
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(const Kokkos::CudaUVMSpace& arg_space,
-                                const std::string& arg_label,
-                                const size_t arg_alloc_size);
-
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
-
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
-
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
-
   template <typename AliasType>
   inline ::cudaTextureObject_t attach_texture_object() {
     static_assert((std::is_same<AliasType, int>::value ||
@@ -1019,57 +865,32 @@ class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
     // Texture object is attached to the entire allocation range
     return ptr - reinterpret_cast<AliasType*>(RecordBase::m_alloc_ptr);
   }
-
-  static void print_records(std::ostream&, const Kokkos::CudaUVMSpace&,
-                            bool detail = false);
 };
 
 template <>
 class SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace> {
  private:
+  friend class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>;
+
   using RecordBase = SharedAllocationRecord<void, void>;
+  using base_t     = SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static void deallocate(RecordBase*);
-
   static RecordBase s_root_record;
 
   const Kokkos::CudaHostPinnedSpace m_space;
 
  protected:
   ~SharedAllocationRecord();
-  SharedAllocationRecord() : RecordBase(), m_space() {}
+  SharedAllocationRecord() = default;
 
   SharedAllocationRecord(
       const Kokkos::CudaHostPinnedSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
       const RecordBase::function_type arg_dealloc = &deallocate);
-
- public:
-  std::string get_label() const;
-
-  static SharedAllocationRecord* allocate(
-      const Kokkos::CudaHostPinnedSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size);
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(const Kokkos::CudaHostPinnedSpace& arg_space,
-                                const std::string& arg_label,
-                                const size_t arg_alloc_size);
-
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
-
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
-
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
-
-  static void print_records(std::ostream&, const Kokkos::CudaHostPinnedSpace&,
-                            bool detail = false);
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 3afe0817013445d38cbcb12f65a76471d9cadb04..55aed13670e69838d94fff2735d421cc49a11835 100644
--- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -856,11 +856,12 @@ KOKKOS_INLINE_FUNCTION_DELETED
     Impl::ThreadVectorRangeBoundariesStruct<iType, TeamMemberType>
     ThreadVectorRange(const TeamMemberType&, const iType& count) = delete;
 
-template <typename iType, class TeamMemberType, class _never_use_this_overload>
-KOKKOS_INLINE_FUNCTION_DELETED
-    Impl::ThreadVectorRangeBoundariesStruct<iType, TeamMemberType>
-    ThreadVectorRange(const TeamMemberType&, const iType& arg_begin,
-                      const iType& arg_end) = delete;
+template <typename iType1, typename iType2, class TeamMemberType,
+          class _never_use_this_overload>
+KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, TeamMemberType>
+ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin,
+                  const iType2& arg_end) = delete;
 
 namespace Impl {
 
@@ -902,85 +903,6 @@ struct ParallelConstructName<FunctorType, TagType, false> {
 }  // namespace Kokkos
 
 namespace Kokkos {
-namespace Experimental {
-
-namespace Impl {
-template <class Property, class Policy>
-struct PolicyPropertyAdaptor;
-
-template <unsigned long P, template <class...> class Policy,
-          class... Properties>
-struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,
-                             Policy<Properties...>> {
-  using policy_in_t = Policy<Properties...>;
-  static_assert(is_execution_policy<policy_in_t>::value, "");
-  using policy_out_t = Policy<typename policy_in_t::traits::execution_space,
-                              typename policy_in_t::traits::schedule_type,
-                              typename policy_in_t::traits::work_tag,
-                              typename policy_in_t::traits::index_type,
-                              typename policy_in_t::traits::iteration_pattern,
-                              typename policy_in_t::traits::launch_bounds,
-                              WorkItemProperty::ImplWorkItemProperty<P>,
-                              typename policy_in_t::traits::occupancy_control>;
-};
-
-template <template <class...> class Policy, class... Properties>
-struct PolicyPropertyAdaptor<DesiredOccupancy, Policy<Properties...>> {
-  using policy_in_t = Policy<Properties...>;
-  static_assert(is_execution_policy<policy_in_t>::value, "");
-  using policy_out_t = Policy<typename policy_in_t::traits::execution_space,
-                              typename policy_in_t::traits::schedule_type,
-                              typename policy_in_t::traits::work_tag,
-                              typename policy_in_t::traits::index_type,
-                              typename policy_in_t::traits::iteration_pattern,
-                              typename policy_in_t::traits::launch_bounds,
-                              typename policy_in_t::traits::work_item_property,
-                              DesiredOccupancy>;
-  static_assert(policy_out_t::experimental_contains_desired_occupancy, "");
-};
-
-template <template <class...> class Policy, class... Properties>
-struct PolicyPropertyAdaptor<MaximizeOccupancy, Policy<Properties...>> {
-  using policy_in_t = Policy<Properties...>;
-  static_assert(is_execution_policy<policy_in_t>::value, "");
-  using policy_out_t = Policy<typename policy_in_t::traits::execution_space,
-                              typename policy_in_t::traits::schedule_type,
-                              typename policy_in_t::traits::work_tag,
-                              typename policy_in_t::traits::index_type,
-                              typename policy_in_t::traits::iteration_pattern,
-                              typename policy_in_t::traits::launch_bounds,
-                              typename policy_in_t::traits::work_item_property,
-                              MaximizeOccupancy>;
-  static_assert(!policy_out_t::experimental_contains_desired_occupancy, "");
-};
-}  // namespace Impl
-
-template <class PolicyType, unsigned long P>
-constexpr typename Impl::PolicyPropertyAdaptor<
-    WorkItemProperty::ImplWorkItemProperty<P>, PolicyType>::policy_out_t
-require(const PolicyType p, WorkItemProperty::ImplWorkItemProperty<P>) {
-  return typename Impl::PolicyPropertyAdaptor<
-      WorkItemProperty::ImplWorkItemProperty<P>, PolicyType>::policy_out_t(p);
-}
-
-template <typename Policy>
-/*constexpr*/ typename Impl::PolicyPropertyAdaptor<DesiredOccupancy,
-                                                   Policy>::policy_out_t
-prefer(Policy const& p, DesiredOccupancy occ) {
-  typename Impl::PolicyPropertyAdaptor<DesiredOccupancy, Policy>::policy_out_t
-      pwo{p};
-  pwo.impl_set_desired_occupancy(occ);
-  return pwo;
-}
-
-template <typename Policy>
-constexpr typename Impl::PolicyPropertyAdaptor<MaximizeOccupancy,
-                                               Policy>::policy_out_t
-prefer(Policy const& p, MaximizeOccupancy) {
-  return {p};
-}
-
-}  // namespace Experimental
 
 namespace Impl {
 
diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
index 80a8f3ad368645f1f58aa27b129e6684088f8798..d0366b599cf8c80c92812e386ced90f6fa77eb93 100644
--- a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -316,29 +316,5 @@ struct DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, ExecutionSpace> {
 
 }  // namespace Kokkos
 
-namespace Kokkos {
-
-namespace Impl {
-
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
-                                           Kokkos::Experimental::HBWSpace> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::Experimental::HBWSpace,
-                                           Kokkos::HostSpace> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
-}  // namespace Impl
-
-}  // namespace Kokkos
-
 #endif
 #endif  // #define KOKKOS_HBWSPACE_HPP
diff --git a/packages/kokkos/core/src/Kokkos_HIP.hpp b/packages/kokkos/core/src/Kokkos_HIP.hpp
index 7afda3b43e0ddba6e8ef8776a244a0e03deba492..33cf8321c80282d5346c66afb5ee9b4be589576b 100644
--- a/packages/kokkos/core/src/Kokkos_HIP.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP.hpp
@@ -57,6 +57,7 @@
 #include <impl/Kokkos_Tags.hpp>
 
 #include <HIP/Kokkos_HIP_Instance.hpp>
+#include <HIP/Kokkos_HIP_MDRangePolicy.hpp>
 #include <HIP/Kokkos_HIP_Parallel_Range.hpp>
 #include <HIP/Kokkos_HIP_Parallel_MDRange.hpp>
 #include <HIP/Kokkos_HIP_Parallel_Team.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
index 5d045aa27be69a3d8ca3518ffe0ea58fe11451b1..17bd681aa4b7b7aa8d98bb8253c86db81de6ce05 100644
--- a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
@@ -61,6 +61,7 @@
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
 
 #include <hip/hip_runtime_api.h>
 /*--------------------------------------------------------------------------*/
@@ -117,8 +118,8 @@ class HIPSpace {
 
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access HIPSpace */
-  static void access_error();
-  static void access_error(const void* const);
+  KOKKOS_DEPRECATED static void access_error();
+  KOKKOS_DEPRECATED static void access_error(const void* const);
 
  private:
   int m_device;  ///< Which HIP device
@@ -128,43 +129,6 @@ class HIPSpace {
 };
 
 }  // namespace Experimental
-
-namespace Impl {
-
-/// \brief Initialize lock array for arbitrary size atomics.
-///
-/// Arbitrary atomics are implemented using a hash table of locks
-/// where the hash value is derived from the address of the
-/// object for which an atomic operation is performed.
-/// This function initializes the locks to zero (unset).
-void init_lock_arrays_hip_space();
-
-/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
-///
-/// Arbitrary atomics are implemented using a hash table of locks
-/// where the hash value is derived from the address of the
-/// object for which an atomic operation is performed.
-/// This function retrieves the lock array pointer.
-/// If the array is not yet allocated it will do so.
-int* atomic_lock_array_hip_space_ptr(bool deallocate = false);
-
-/// \brief Retrieve the pointer to the scratch array for team and thread private
-/// global memory.
-///
-/// Team and Thread private scratch allocations in
-/// global memory are acquired via locks.
-/// This function retrieves the lock array pointer.
-/// If the array is not yet allocated it will do so.
-int* scratch_lock_array_hip_space_ptr(bool deallocate = false);
-
-/// \brief Retrieve the pointer to the scratch array for unique identifiers.
-///
-/// Unique identifiers in the range 0-HIP::concurrency
-/// are provided via locks.
-/// This function retrieves the lock array pointer.
-/// If the array is not yet allocated it will do so.
-int* threadid_lock_array_hip_space_ptr(bool deallocate = false);
-}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -483,88 +447,21 @@ struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
 namespace Kokkos {
 namespace Impl {
 
-/** Running in HIPSpace attempting to access HostSpace: error */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::Experimental::HIPSpace,
-                                           Kokkos::HostSpace> {
-  enum : bool { value = false };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {
-    Kokkos::abort("HIP code attempted to access HostSpace memory");
-  }
-
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {
-    Kokkos::abort("HIP code attempted to access HostSpace memory");
-  }
-};
-
-/** Running in HIPSpace accessing HIPHostPinnedSpace: ok */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPHostPinnedSpace> {
-  enum : bool { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
-};
-
-/** Running in HIPSpace attempting to access an unknown space: error */
-template <class OtherSpace>
-struct VerifyExecutionCanAccessMemorySpace<
-    typename std::enable_if<
-        !std::is_same<Kokkos::Experimental::HIPSpace, OtherSpace>::value,
-        Kokkos::Experimental::HIPSpace>::type,
-    OtherSpace> {
-  enum : bool { value = false };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {
-    Kokkos::abort("HIP code attempted to access unknown Space memory");
-  }
-
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {
-    Kokkos::abort("HIP code attempted to access unknown Space memory");
-  }
-};
-
-//----------------------------------------------------------------------------
-/** Running in HostSpace attempting to access HIPSpace */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,
-                                           Kokkos::Experimental::HIPSpace> {
-  enum : bool { value = false };
-  inline static void verify(void) {
-    Kokkos::Experimental::HIPSpace::access_error();
-  }
-  inline static void verify(const void* p) {
-    Kokkos::Experimental::HIPSpace::access_error(p);
-  }
-};
-
-/** Running in HostSpace accessing HIPHostPinnedSpace is OK */
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::HostSpace, Kokkos::Experimental::HIPHostPinnedSpace> {
-  enum : bool { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
-};
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
 template <>
 class SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public HostInaccessibleSharedAllocationRecordCommon<
+          Kokkos::Experimental::HIPSpace> {
  private:
+  friend class SharedAllocationRecordCommon<Kokkos::Experimental::HIPSpace>;
+  friend class HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::HIPSpace>;
+  using base_t = HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::HIPSpace>;
   using RecordBase = SharedAllocationRecord<void, void>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static void deallocate(RecordBase*);
-
 #ifdef KOKKOS_ENABLE_DEBUG
   static RecordBase s_root_record;
 #endif
@@ -577,45 +474,23 @@ class SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>
   SharedAllocationRecord(
       const Kokkos::Experimental::HIPSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
-
- public:
-  std::string get_label() const;
-
-  static SharedAllocationRecord* allocate(
-      const Kokkos::Experimental::HIPSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size);
-
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(const Kokkos::Experimental::HIPSpace& arg_space,
-                                const std::string& arg_label,
-                                const size_t arg_alloc_size);
-
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
-
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
-
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
-
-  static void print_records(std::ostream&,
-                            const Kokkos::Experimental::HIPSpace&,
-                            bool detail = false);
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
 
 template <>
 class SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public SharedAllocationRecordCommon<
+          Kokkos::Experimental::HIPHostPinnedSpace> {
  private:
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::HIPHostPinnedSpace>;
+  using base_t =
+      SharedAllocationRecordCommon<Kokkos::Experimental::HIPHostPinnedSpace>;
   using RecordBase = SharedAllocationRecord<void, void>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static void deallocate(RecordBase*);
-
 #ifdef KOKKOS_ENABLE_DEBUG
   static RecordBase s_root_record;
 #endif
@@ -624,36 +499,12 @@ class SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>
 
  protected:
   ~SharedAllocationRecord();
-  SharedAllocationRecord() : RecordBase(), m_space() {}
+  SharedAllocationRecord() = default;
 
   SharedAllocationRecord(
       const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
-
- public:
-  std::string get_label() const;
-
-  static SharedAllocationRecord* allocate(
-      const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size);
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(
-      const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size);
-
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
-
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
-
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
-
-  static void print_records(std::ostream&,
-                            const Kokkos::Experimental::HIPHostPinnedSpace&,
-                            bool detail = false);
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
 }  // namespace Impl
 }  // namespace Kokkos
@@ -687,13 +538,6 @@ class HIP {
   HIP();
   HIP(hipStream_t stream);
 
-  KOKKOS_FUNCTION HIP(HIP&& other) noexcept;
-  KOKKOS_FUNCTION HIP(HIP const& other);
-  KOKKOS_FUNCTION HIP& operator=(HIP&&) noexcept;
-  KOKKOS_FUNCTION HIP& operator=(HIP const&);
-
-  KOKKOS_FUNCTION ~HIP() noexcept;
-
   //@}
   //------------------------------------
   //! \name Functions that all Kokkos devices must implement.
@@ -749,14 +593,13 @@ class HIP {
   static const char* name();
 
   inline Impl::HIPInternal* impl_internal_space_instance() const {
-    return m_space_instance;
+    return m_space_instance.get();
   }
 
   uint32_t impl_instance_id() const noexcept { return 0; }
 
  private:
-  Impl::HIPInternal* m_space_instance;
-  int* m_counter;
+  Kokkos::Impl::HostSharedPtr<Impl::HIPInternal> m_space_instance;
 };
 }  // namespace Experimental
 namespace Tools {
@@ -794,27 +637,6 @@ struct MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
   enum : bool { deepcopy = false };
 };
 
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::Experimental::HIP::memory_space,
-    Kokkos::Experimental::HIP::scratch_memory_space> {
-  enum : bool { value = true };
-  KOKKOS_INLINE_FUNCTION static void verify(void) {}
-  KOKKOS_INLINE_FUNCTION static void verify(const void*) {}
-};
-
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::HostSpace, Kokkos::Experimental::HIP::scratch_memory_space> {
-  enum : bool { value = false };
-  inline static void verify(void) {
-    Kokkos::Experimental::HIPSpace::access_error();
-  }
-  inline static void verify(const void* p) {
-    Kokkos::Experimental::HIPSpace::access_error(p);
-  }
-};
-
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_HPX.hpp b/packages/kokkos/core/src/Kokkos_HPX.hpp
index 279744b77986203e65245fda8960faed227aa9c1..2100b49c116cfaecd35205aa60708ed1535578ca 100644
--- a/packages/kokkos/core/src/Kokkos_HPX.hpp
+++ b/packages/kokkos/core/src/Kokkos_HPX.hpp
@@ -523,14 +523,6 @@ struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space,
   enum : bool { deepcopy = false };
 };
 
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::Experimental::HPX::memory_space,
-    Kokkos::Experimental::HPX::scratch_memory_space> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void *) {}
-};
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -1172,6 +1164,15 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       : m_functor(arg_functor),
         m_mdr_policy(arg_policy),
         m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
 };
 }  // namespace Impl
 }  // namespace Kokkos
@@ -1715,6 +1716,15 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
         m_force_synchronous(!reducer.view().impl_track().has_record()) {}
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
 };
 }  // namespace Impl
 }  // namespace Kokkos
@@ -2438,13 +2448,14 @@ KOKKOS_INLINE_FUNCTION
       thread, count);
 }
 
-template <typename iType>
-KOKKOS_INLINE_FUNCTION
-    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
-    ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &i_begin,
-                      const iType &i_end) {
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
+ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
+                  const iType2 &i_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
-      thread, i_begin, i_end);
+      thread, iType(i_begin), iType(i_end));
 }
 
 KOKKOS_INLINE_FUNCTION
@@ -2615,6 +2626,27 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   }
 }
 
+/** \brief  Intra-thread vector parallel scan with reducer
+ *
+ */
+template <typename iType, class FunctorType, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                      iType, Impl::HPXTeamMember> &loop_boundaries,
+                  const FunctorType &lambda, const ReducerType &reducer) {
+  typename ReducerType::value_type scan_val;
+  reducer.init(scan_val);
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, true);
+  }
+}
+
 template <class FunctorType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
index 1d34eaf007a5df248835bd4b9cd4d977022a9091..ba69fbad393ee391eff2b59c34d4ae526fa7af29 100644
--- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -242,17 +242,17 @@ namespace Impl {
 
 template <>
 class SharedAllocationRecord<Kokkos::HostSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public SharedAllocationRecordCommon<Kokkos::HostSpace> {
  private:
   friend Kokkos::HostSpace;
+  friend class SharedAllocationRecordCommon<Kokkos::HostSpace>;
 
+  using base_t     = SharedAllocationRecordCommon<Kokkos::HostSpace>;
   using RecordBase = SharedAllocationRecord<void, void>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static void deallocate(RecordBase*);
-
 #ifdef KOKKOS_ENABLE_DEBUG
   /**\brief  Root record for tracked allocations from this HostSpace instance */
   static RecordBase s_root_record;
@@ -275,10 +275,6 @@ class SharedAllocationRecord<Kokkos::HostSpace, void>
       const RecordBase::function_type arg_dealloc = &deallocate);
 
  public:
-  inline std::string get_label() const {
-    return std::string(RecordBase::head()->m_label);
-  }
-
   KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate(
       const Kokkos::HostSpace& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size) {
@@ -291,23 +287,6 @@ class SharedAllocationRecord<Kokkos::HostSpace, void>
     return (SharedAllocationRecord*)0;
 #endif
   }
-
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(const Kokkos::HostSpace& arg_space,
-                                const std::string& arg_label,
-                                const size_t arg_alloc_size);
-
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
-
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
-
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
-
-  static void print_records(std::ostream&, const Kokkos::HostSpace&,
-                            bool detail = false);
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp
index 979e54da4e475cf9e7bfd7b5e3c0b3c0a3fe7c81..caa41b79b096dd2e7f2697f164d2cc3819834fc2 100644
--- a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp
+++ b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp
@@ -264,10 +264,10 @@ class SharedAllocationRecord<Kokkos::Experimental::LogicalMemorySpace<
         static_cast<SharedAllocationRecord<void, void>*>(this);
 
     strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(),
-            SharedAllocationHeader::maximum_label_length);
+            SharedAllocationHeader::maximum_label_length - 1);
     // Set last element zero, in case c_str is too long
     RecordBase::m_alloc_ptr
-        ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
+        ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0';
   }
 
  public:
diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp
index 874b0dcc59f6f0d8b26a107d2b60f0f8912c29e2..0d0185346540bf929b4305d6ad496b2f02e39c69 100644
--- a/packages/kokkos/core/src/Kokkos_Macros.hpp
+++ b/packages/kokkos/core/src/Kokkos_Macros.hpp
@@ -382,6 +382,12 @@
 #define KOKKOS_IMPL_DEVICE_FUNCTION
 #endif
 
+// Temporary solution for SYCL not supporting printf in kernels.
+// Might disappear at any point once we have found another solution.
+#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF)
+#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) printf(__VA_ARGS__)
+#endif
+
 //----------------------------------------------------------------------------
 // Define final version of functions. This is so that clang tidy can find these
 // macros more easily
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..50223651e7d189e07cd94f9bf48eb6c5dcaa62d2
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
@@ -0,0 +1,233 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_HPP
+#define KOKKOS_MATHEMATICAL_FUNCTIONS_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cmath>
+#include <algorithm>
+#include <type_traits>
+
+#ifdef KOKKOS_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#endif
+
+namespace Kokkos {
+namespace Experimental {
+
+#if defined(KOKKOS_ENABLE_SYCL)
+#define NAMESPACE_MATH_FUNCTIONS sycl
+#else
+#define NAMESPACE_MATH_FUNCTIONS std
+#endif
+
+#define KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, RETURNTYPE, ARGTYPE) \
+  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(ARGTYPE x) {                        \
+    using NAMESPACE_MATH_FUNCTIONS::FUNC;                                    \
+    return FUNC(x);                                                          \
+  }
+
+#define KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, RETURNTYPE)              \
+  template <typename Integer,                                              \
+            typename = std::enable_if_t<std::is_integral<Integer>::value>> \
+  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(Integer x) {                      \
+    return Kokkos::Experimental::FUNC(static_cast<double>(x));             \
+  }
+
+#define KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, TYPE) \
+  KOKKOS_INLINE_FUNCTION TYPE FUNC(TYPE x, TYPE y) {           \
+    using NAMESPACE_MATH_FUNCTIONS::FUNC;                      \
+    return FUNC(x, y);                                         \
+  }
+
+// NOTE long double overloads are not available on the device
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+
+#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
+  template <typename Arithmetic1, typename Arithmetic2,                      \
+            typename = std::enable_if_t<                                     \
+                std::is_arithmetic<Arithmetic1>::value &&                    \
+                std::is_arithmetic<Arithmetic2>::value &&                    \
+                !std::is_same<Arithmetic1, long double>::value &&            \
+                !std::is_same<Arithmetic2, long double>::value>>             \
+  KOKKOS_INLINE_FUNCTION double FUNC(Arithmetic1 x, Arithmetic2 y) {         \
+    return Kokkos::Experimental::FUNC(                                       \
+        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
+                                       double, Arithmetic1>>(x),             \
+        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
+                                       double, Arithmetic2>>(y));            \
+  }
+
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                     \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)   \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double) \
+  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
+
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                  \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)  \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double) \
+  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
+
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)             \
+  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)  \
+  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double) \
+  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
+
+#define KOKKOS_IMPL_MATH_NAN()                                        \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*)
+
+#else  // long double overloads are available
+
+#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
+  template <typename Arithmetic1, typename Arithmetic2,                      \
+            typename =                                                       \
+                std::enable_if_t<std::is_arithmetic<Arithmetic1>::value &&   \
+                                 std::is_arithmetic<Arithmetic2>::value>,    \
+            typename Promoted = std::conditional_t<                          \
+                std::is_same<Arithmetic1, long double>::value ||             \
+                    std::is_same<Arithmetic2, long double>::value,           \
+                long double, double>>                                        \
+  KOKKOS_INLINE_FUNCTION Promoted FUNC(Arithmetic1 x, Arithmetic2 y) {       \
+    return Kokkos::Experimental::FUNC(                                       \
+        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
+                                       double, Arithmetic1>>(x),             \
+        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
+                                       double, Arithmetic2>>(y));            \
+  }
+
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                               \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)             \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double)           \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, long double, long double) \
+  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
+
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                       \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)       \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double)      \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, long double) \
+  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
+
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                  \
+  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)       \
+  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double)      \
+  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, long double) \
+  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
+
+#define KOKKOS_IMPL_MATH_NAN()                                        \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*) \
+  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanl, long double, char const*)
+
+#endif
+
+// Basic operations
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmax)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(fdim)
+#ifndef KOKKOS_ENABLE_SYCL
+KOKKOS_IMPL_MATH_NAN()
+#endif
+// Power functions
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(sqrt)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(cbrt)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(hypot)
+// Exponential functions
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp2)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(expm1)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(log)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(log10)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(log2)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(log1p)
+// Trigonometric functions
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(sin)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(cos)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(tan)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(asin)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(acos)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(atan)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(atan2)
+// Hyperbolic functions
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(sinh)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(cosh)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(tanh)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(asinh)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(acosh)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(atanh)
+// Error and gamma functions
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(erf)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(erfc)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(tgamma)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(lgamma)
+// Nearest integer floating point operations
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(ceil)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(floor)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(trunc)
+#ifndef KOKKOS_ENABLE_SYCL
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(nearbyint)
+#endif
+// Classification and comparison
+KOKKOS_IMPL_MATH_UNARY_PREDICATE(isfinite)
+KOKKOS_IMPL_MATH_UNARY_PREDICATE(isinf)
+KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnan)
+
+#undef KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT
+#undef KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL
+#undef KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT
+#undef KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC
+#undef KOKKOS_IMPL_MATH_UNARY_FUNCTION
+#undef KOKKOS_IMPL_MATH_UNARY_PREDICATE
+#undef KOKKOS_IMPL_MATH_BINARY_FUNCTION
+#undef KOKKOS_IMPL_MATH_NAN
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
index 042ad6d9023494f7650f8c17ab24c0c0f424d929..2cafac1aea462ec29fe1d1cb853cb374ea7e8109 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -408,7 +408,7 @@ class MemoryPool {
     const size_t alloc_size =
         header_size + (size_t(m_sb_count) << m_sb_size_lg2);
 
-    Record *rec = Record::allocate(memspace, "MemoryPool", alloc_size);
+    Record *rec = Record::allocate(memspace, "Kokkos::MemoryPool", alloc_size);
 
     m_tracker.assign_allocated_record_to_uninitialized(rec);
 
@@ -524,7 +524,9 @@ class MemoryPool {
     // Fast query clock register 'tic' to pseudo-randomize
     // the guess for which block within a superblock should
     // be claimed.  If not available then a search occurs.
-
+#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GEN)
+    const uint32_t block_id_hint = alloc_size;
+#else
     const uint32_t block_id_hint =
         (uint32_t)(Kokkos::Impl::clock_tic()
 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
@@ -533,6 +535,7 @@ class MemoryPool {
                    + (threadIdx.x + blockDim.x * threadIdx.y)
 #endif
         );
+#endif
 
     // expected state of superblock for allocation
     uint32_t sb_state = block_state;
diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
index 7d55a96523c218c74e6f159905aa627783b1da06..b9380cbe02b42a04c5b21b6cb8408016049d15f8 100644
--- a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -42,14 +42,199 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_NUMERICTRAITS_HPP
-#define KOKKOS_NUMERICTRAITS_HPP
+#ifndef KOKKOS_NUMERIC_TRAITS_HPP
+#define KOKKOS_NUMERIC_TRAITS_HPP
 
 #include <Kokkos_Macros.hpp>
-#include <climits>
 #include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
 
 namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+// clang-format off
+template <class> struct infinity_helper;
+template <> struct infinity_helper<float> { static constexpr float value = HUGE_VALF; };
+template <> struct infinity_helper<double> { static constexpr double value = HUGE_VAL; };
+template <> struct infinity_helper<long double> { static constexpr long double value = HUGE_VALL; };
+template <class> struct finite_min_helper;
+template <> struct finite_min_helper<bool> { static constexpr bool value = false; };
+template <> struct finite_min_helper<char> { static constexpr char value = CHAR_MIN; };
+template <> struct finite_min_helper<signed char> { static constexpr signed char value = SCHAR_MIN; };
+template <> struct finite_min_helper<unsigned char> { static constexpr unsigned char value = 0; };
+template <> struct finite_min_helper<short> { static constexpr short value = SHRT_MIN; };
+template <> struct finite_min_helper<unsigned short> { static constexpr unsigned short value = 0; };
+template <> struct finite_min_helper<int> { static constexpr int value = INT_MIN; };
+template <> struct finite_min_helper<unsigned int> { static constexpr unsigned int value = 0; };
+template <> struct finite_min_helper<long int> { static constexpr long int value = LONG_MIN; };
+template <> struct finite_min_helper<unsigned long int> { static constexpr unsigned long int value = 0; };
+template <> struct finite_min_helper<long long int> { static constexpr long long int value = LLONG_MIN; };
+template <> struct finite_min_helper<unsigned long long int> { static constexpr unsigned long long int value = 0; };
+template <> struct finite_min_helper<float> { static constexpr float value = -FLT_MAX; };
+template <> struct finite_min_helper<double> { static constexpr double value = -DBL_MAX; };
+template <> struct finite_min_helper<long double> { static constexpr long double value = -LDBL_MAX; };
+template <class> struct finite_max_helper;
+template <> struct finite_max_helper<bool> { static constexpr bool value = true; };
+template <> struct finite_max_helper<char> { static constexpr char value = CHAR_MAX; };
+template <> struct finite_max_helper<signed char> { static constexpr signed char value = SCHAR_MAX; };
+template <> struct finite_max_helper<unsigned char> { static constexpr unsigned char value = UCHAR_MAX; };
+template <> struct finite_max_helper<short> { static constexpr short value = SHRT_MAX; };
+template <> struct finite_max_helper<unsigned short> { static constexpr unsigned short value = USHRT_MAX; };
+template <> struct finite_max_helper<int> { static constexpr int value = INT_MAX; };
+template <> struct finite_max_helper<unsigned int> { static constexpr unsigned int value = UINT_MAX; };
+template <> struct finite_max_helper<long int> { static constexpr long int value = LONG_MAX; };
+template <> struct finite_max_helper<unsigned long int> { static constexpr unsigned long int value = ULONG_MAX; };
+template <> struct finite_max_helper<long long int> { static constexpr long long int value = LLONG_MAX; };
+template <> struct finite_max_helper<unsigned long long int> { static constexpr unsigned long long int value = ULLONG_MAX; };
+template <> struct finite_max_helper<float> { static constexpr float value = FLT_MAX; };
+template <> struct finite_max_helper<double> { static constexpr double value = DBL_MAX; };
+template <> struct finite_max_helper<long double> { static constexpr long double value = LDBL_MAX; };
+template <class> struct epsilon_helper;
+namespace{
+  // FIXME workaround for LDL_EPSILON with XL
+  template<typename T>
+  constexpr T machineeps() {
+    T epsilon = 1, prev = 1, expression = 1;
+    do {
+      prev = epsilon;
+      epsilon /= 2;
+      expression = 1 + epsilon;
+    } while (expression > 1);
+    return prev;
+  }
+}
+template <> struct epsilon_helper<float> { static constexpr float value = FLT_EPSILON; };
+template <> struct epsilon_helper<double> { static constexpr double value = DBL_EPSILON; };
+template <> struct epsilon_helper<long double> {
+#ifdef KOKKOS_COMPILER_IBM
+  static constexpr long double value = machineeps<long double>();
+#else
+  static constexpr long double value = LDBL_EPSILON;
+#endif
+};
+template <class> struct round_error_helper;
+template <> struct round_error_helper<float> { static constexpr float value = 0.5F; };
+template <> struct round_error_helper<double> { static constexpr double value = 0.5; };
+template <> struct round_error_helper<long double> { static constexpr long double value = 0.5L; };
+template <class> struct norm_min_helper;
+template <> struct norm_min_helper<float> { static constexpr float value = FLT_MIN; };
+template <> struct norm_min_helper<double> { static constexpr double value = DBL_MIN; };
+template <> struct norm_min_helper<long double> { static constexpr long double value = LDBL_MIN; };
+template <class> struct digits_helper;
+template <> struct digits_helper<bool> { static constexpr int value = 1; };
+template <> struct digits_helper<char> { static constexpr int value = CHAR_BIT - std::is_signed<char>::value; };
+template <> struct digits_helper<signed char> { static constexpr int value = CHAR_BIT - 1; };
+template <> struct digits_helper<unsigned char> { static constexpr int value = CHAR_BIT; };
+template <> struct digits_helper<short> { static constexpr int value = CHAR_BIT*sizeof(short)-1; };
+template <> struct digits_helper<unsigned short> { static constexpr int value = CHAR_BIT*sizeof(short); };
+template <> struct digits_helper<int> { static constexpr int value = CHAR_BIT*sizeof(int)-1; };
+template <> struct digits_helper<unsigned int> { static constexpr int value = CHAR_BIT*sizeof(int); };
+template <> struct digits_helper<long int> { static constexpr int value = CHAR_BIT*sizeof(long int)-1; };
+template <> struct digits_helper<unsigned long int> { static constexpr int value = CHAR_BIT*sizeof(long int); };
+template <> struct digits_helper<long long int> { static constexpr int value = CHAR_BIT*sizeof(long long int)-1; };
+template <> struct digits_helper<unsigned long long int> { static constexpr int value = CHAR_BIT*sizeof(long long int); };
+template <> struct digits_helper<float> { static constexpr int value = FLT_MANT_DIG; };
+template <> struct digits_helper<double> { static constexpr int value = DBL_MANT_DIG; };
+template <> struct digits_helper<long double> { static constexpr int value = LDBL_MANT_DIG; };
+template <class> struct digits10_helper;
+template <> struct digits10_helper<bool> { static constexpr int value = 0; };
+constexpr double log10_2 = 2.41;
+#define DIGITS10_HELPER_INTEGRAL(TYPE) \
+template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * log10_2; };
+DIGITS10_HELPER_INTEGRAL(char)
+DIGITS10_HELPER_INTEGRAL(signed char)
+DIGITS10_HELPER_INTEGRAL(unsigned char)
+DIGITS10_HELPER_INTEGRAL(short)
+DIGITS10_HELPER_INTEGRAL(unsigned short)
+DIGITS10_HELPER_INTEGRAL(int)
+DIGITS10_HELPER_INTEGRAL(unsigned int)
+DIGITS10_HELPER_INTEGRAL(long int)
+DIGITS10_HELPER_INTEGRAL(unsigned long int)
+DIGITS10_HELPER_INTEGRAL(long long int)
+DIGITS10_HELPER_INTEGRAL(unsigned long long int)
+#undef DIGITS10_HELPER_INTEGRAL
+template <> struct digits10_helper<float> { static constexpr int value = FLT_DIG; };
+template <> struct digits10_helper<double> { static constexpr int value = DBL_DIG; };
+template <> struct digits10_helper<long double> { static constexpr int value = LDBL_DIG; };
+template <class> struct max_digits10_helper;
+// FIXME not sure why were not defined in my <cfloat>
+//template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; };
+//template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; };
+//template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; };
+template <> struct max_digits10_helper<float> { static constexpr int value = 9; };
+template <> struct max_digits10_helper<double> { static constexpr int value = 17; };
+template <> struct max_digits10_helper<long double> { static constexpr int value = 21; };
+template <class> struct radix_helper;
+template <> struct radix_helper<bool> { static constexpr int value = 2; };
+template <> struct radix_helper<char> { static constexpr int value = 2; };
+template <> struct radix_helper<signed char> { static constexpr int value = 2; };
+template <> struct radix_helper<unsigned char> { static constexpr int value = 2; };
+template <> struct radix_helper<short> { static constexpr int value = 2; };
+template <> struct radix_helper<unsigned short> { static constexpr int value = 2; };
+template <> struct radix_helper<int> { static constexpr int value = 2; };
+template <> struct radix_helper<unsigned int> { static constexpr int value = 2; };
+template <> struct radix_helper<long int> { static constexpr int value = 2; };
+template <> struct radix_helper<unsigned long int> { static constexpr int value = 2; };
+template <> struct radix_helper<long long int> { static constexpr int value = 2; };
+template <> struct radix_helper<unsigned long long int> { static constexpr int value = 2; };
+template <> struct radix_helper<float> { static constexpr int value = FLT_RADIX; };
+template <> struct radix_helper<double> { static constexpr int value = FLT_RADIX; };
+template <> struct radix_helper<long double> { static constexpr int value = FLT_RADIX; };
+template <class> struct min_exponent_helper;
+template <> struct min_exponent_helper<float> { static constexpr int value = FLT_MIN_EXP; };
+template <> struct min_exponent_helper<double> { static constexpr int value = DBL_MIN_EXP; };
+template <> struct min_exponent_helper<long double> { static constexpr int value = LDBL_MIN_EXP; };
+template <class> struct min_exponent10_helper;
+template <> struct min_exponent10_helper<float> { static constexpr int value = FLT_MIN_10_EXP; };
+template <> struct min_exponent10_helper<double> { static constexpr int value = DBL_MIN_10_EXP; };
+template <> struct min_exponent10_helper<long double> { static constexpr int value = LDBL_MIN_10_EXP; };
+template <class> struct max_exponent_helper;
+template <> struct max_exponent_helper<float> { static constexpr int value = FLT_MAX_EXP; };
+template <> struct max_exponent_helper<double> { static constexpr int value = DBL_MAX_EXP; };
+template <> struct max_exponent_helper<long double> { static constexpr int value = LDBL_MAX_EXP; };
+template <class> struct max_exponent10_helper;
+template <> struct max_exponent10_helper<float> { static constexpr int value = FLT_MAX_10_EXP; };
+template <> struct max_exponent10_helper<double> { static constexpr int value = DBL_MAX_10_EXP; };
+template <> struct max_exponent10_helper<long double> { static constexpr int value = LDBL_MAX_10_EXP; };
+// clang-format on
+}  // namespace Impl
+
+#if defined(KOKKOS_ENABLE_CXX17)
+#define KOKKOS_IMPL_DEFINE_TRAIT(TRAIT)      \
+  template <class T>                         \
+  struct TRAIT : Impl::TRAIT##_helper<T> {}; \
+  template <class T>                         \
+  inline constexpr auto TRAIT##_v = TRAIT<T>::value;
+#else
+#define KOKKOS_IMPL_DEFINE_TRAIT(TRAIT) \
+  template <class T>                    \
+  struct TRAIT : Impl::TRAIT##_helper<T> {};
+#endif
+
+// Numeric distinguished value traits
+KOKKOS_IMPL_DEFINE_TRAIT(infinity)
+KOKKOS_IMPL_DEFINE_TRAIT(finite_min)
+KOKKOS_IMPL_DEFINE_TRAIT(finite_max)
+KOKKOS_IMPL_DEFINE_TRAIT(epsilon)
+KOKKOS_IMPL_DEFINE_TRAIT(round_error)
+KOKKOS_IMPL_DEFINE_TRAIT(norm_min)
+
+// Numeric characteristics traits
+KOKKOS_IMPL_DEFINE_TRAIT(digits)
+KOKKOS_IMPL_DEFINE_TRAIT(digits10)
+KOKKOS_IMPL_DEFINE_TRAIT(max_digits10)
+KOKKOS_IMPL_DEFINE_TRAIT(radix)
+KOKKOS_IMPL_DEFINE_TRAIT(min_exponent)
+KOKKOS_IMPL_DEFINE_TRAIT(min_exponent10)
+KOKKOS_IMPL_DEFINE_TRAIT(max_exponent)
+KOKKOS_IMPL_DEFINE_TRAIT(max_exponent10)
+
+#undef KOKKOS_IMPL_DEFINE_TRAIT
+
+}  // namespace Experimental
 
 template <class T>
 struct reduction_identity; /*{
diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
index 1856c00a65a7ff7cfd40a91b62c9b64119a4434c..eedba38a8456117ac03d8c21e657729673017984 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -208,14 +208,6 @@ struct MemorySpaceAccess<Kokkos::OpenMP::memory_space,
   enum : bool { deepcopy = false };
 };
 
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::OpenMP::memory_space, Kokkos::OpenMP::scratch_memory_space> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
index 4cbeef2d7d6fb2ec3a505270b190ece741cdde6d..2a57a43e63b77b7f60e4cc40bb20272e0332944a 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@@ -150,24 +150,6 @@ class OpenMPTargetSpaceInitializer : public ExecSpaceInitializerBase {
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
-namespace Kokkos {
-namespace Impl {
-
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::Experimental::OpenMPTarget::memory_space,
-    Kokkos::Experimental::OpenMPTarget::scratch_memory_space> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
index f7c043073c51f677516748454d3657d159caf46f..dc5e0194ab0a8bb85a29727c664a33b6c23e2c6c 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@@ -54,8 +54,10 @@
 
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
 
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp>
 #include <Kokkos_HostSpace.hpp>
 #include <omp.h>
+
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
@@ -145,17 +147,22 @@ namespace Impl {
 
 template <>
 class SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public HostInaccessibleSharedAllocationRecordCommon<
+          Kokkos::Experimental::OpenMPTargetSpace> {
  private:
+  friend class HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::OpenMPTargetSpace>;
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::OpenMPTargetSpace>;
   friend Kokkos::Experimental::OpenMPTargetSpace;
 
+  using base_t = HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::OpenMPTargetSpace>;
   using RecordBase = SharedAllocationRecord<void, void>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
 
-  static void deallocate(RecordBase*);
-
   /**\brief  Root record for tracked allocations from this OpenMPTargetSpace
    * instance */
   static RecordBase s_root_record;
@@ -184,23 +191,9 @@ class SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>
 #endif
   }
 
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(
-      const Kokkos::Experimental::OpenMPTargetSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size);
-
   /**\brief  Reallocate tracked memory in the space */
   static void* reallocate_tracked(void* const arg_alloc_ptr,
                                   const size_t arg_alloc_size);
-
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
-
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
-
-  static void print_records(std::ostream&,
-                            const Kokkos::Experimental::OpenMPTargetSpace&,
-                            bool detail = false);
 };
 
 }  // namespace Impl
@@ -217,13 +210,20 @@ template <class ExecutionSpace>
 struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,
                 Kokkos::Experimental::OpenMPTargetSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) {
-    omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
-                      omp_get_default_device(), omp_get_default_device());
+    // In the Release and RelWithDebInfo builds, the size of the memcpy should
+    // be greater than zero to avoid error. omp_target_memcpy returns zero on
+    // success.
+    if (n > 0)
+      OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
+                                       omp_get_default_device(),
+                                       omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
     exec.fence();
-    omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
-                      omp_get_default_device(), omp_get_default_device());
+    if (n > 0)
+      OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
+                                       omp_get_default_device(),
+                                       omp_get_default_device()));
   }
 };
 
@@ -231,13 +231,17 @@ template <class ExecutionSpace>
 struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, HostSpace,
                 ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) {
-    omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
-                      omp_get_default_device(), omp_get_initial_device());
+    if (n > 0)
+      OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
+                                       omp_get_default_device(),
+                                       omp_get_initial_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
     exec.fence();
-    omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
-                      omp_get_default_device(), omp_get_initial_device());
+    if (n > 0)
+      OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
+                                       omp_get_default_device(),
+                                       omp_get_initial_device()));
   }
 };
 
@@ -245,24 +249,20 @@ template <class ExecutionSpace>
 struct DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace,
                 ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) {
-    omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
-                      omp_get_initial_device(), omp_get_default_device());
+    if (n > 0)
+      OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
+                                       omp_get_initial_device(),
+                                       omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
     exec.fence();
-    omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
-                      omp_get_initial_device(), omp_get_default_device());
+    if (n > 0)
+      OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
+                                       omp_get_initial_device(),
+                                       omp_get_default_device()));
   }
 };
 
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::HostSpace, Kokkos::Experimental::OpenMPTargetSpace> {
-  enum : bool { value = false };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp
index a00da4472ba1f57cc658ded9f148b4e2a735b96f..85d1dad454ba64aa1311cf19437206768018571b 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp
@@ -54,6 +54,7 @@
 #include <Kokkos_ExecPolicy.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
+#include <type_traits>
 #include <typeinfo>
 
 #include <impl/Kokkos_Tags.hpp>
@@ -71,6 +72,35 @@
 namespace Kokkos {
 namespace Impl {
 
+template <class T, class = void>
+struct is_detected_execution_space : std::false_type {
+  using type = not_a_type;
+};
+
+template <class T>
+struct is_detected_execution_space<T, void_t<typename T::execution_space>>
+    : std::true_type {
+  using type = typename T::execution_space;
+};
+
+template <class T>
+using detected_execution_space_t =
+    typename is_detected_execution_space<T>::type;
+
+template <class T, class = void>
+struct is_detected_device_type : std::false_type {
+  using type = not_a_type;
+};
+
+template <class T>
+struct is_detected_device_type<T, void_t<typename T::device_type>>
+    : std::true_type {
+  using type = typename T::device_type;
+};
+
+template <class T>
+using detected_device_type_t = typename is_detected_device_type<T>::type;
+
 //----------------------------------------------------------------------------
 /** \brief  Given a Functor and Execution Policy query an execution space.
  *
@@ -79,48 +109,19 @@ namespace Impl {
  *  else if  the Functor has a device_type use that for backward compatibility
  *  else     use the default
  */
-template <class Functor, class Policy, class EnableFunctor, class EnablePolicy>
-struct FunctorPolicyExecutionSpace {
-  using execution_space = Kokkos::DefaultExecutionSpace;
-};
-
-template <class Functor, class Policy>
-struct FunctorPolicyExecutionSpace<
-    Functor, Policy,
-    typename enable_if_type<typename Functor::device_type>::type,
-    typename enable_if_type<typename Policy ::execution_space>::type> {
-  using execution_space = typename Policy::execution_space;
-};
 
 template <class Functor, class Policy>
-struct FunctorPolicyExecutionSpace<
-    Functor, Policy,
-    typename enable_if_type<typename Functor::execution_space>::type,
-    typename enable_if_type<typename Policy ::execution_space>::type> {
-  using execution_space = typename Policy::execution_space;
-};
-
-template <class Functor, class Policy, class EnableFunctor>
-struct FunctorPolicyExecutionSpace<
-    Functor, Policy, EnableFunctor,
-    typename enable_if_type<typename Policy::execution_space>::type> {
-  using execution_space = typename Policy::execution_space;
-};
-
-template <class Functor, class Policy, class EnablePolicy>
-struct FunctorPolicyExecutionSpace<
-    Functor, Policy,
-    typename enable_if_type<typename Functor::device_type>::type,
-    EnablePolicy> {
-  using execution_space = typename Functor::device_type::execution_space;
-};
-
-template <class Functor, class Policy, class EnablePolicy>
-struct FunctorPolicyExecutionSpace<
-    Functor, Policy,
-    typename enable_if_type<typename Functor::execution_space>::type,
-    EnablePolicy> {
-  using execution_space = typename Functor::execution_space;
+struct FunctorPolicyExecutionSpace {
+  using execution_space = std::conditional_t<
+      is_detected_execution_space<Policy>::value,
+      detected_execution_space_t<Policy>,
+      std::conditional_t<
+          is_detected_execution_space<Functor>::value,
+          detected_execution_space_t<Functor>,
+          std::conditional_t<
+              is_detected_device_type<Functor>::value,
+              detected_execution_space_t<detected_device_type_t<Functor>>,
+              Kokkos::DefaultExecutionSpace>>>;
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index e2e894da6e294411bcdcdb505ab9d281bad8fe3f..96242f99b0ca678e1ede6f148ae5d90a16127afe 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -744,8 +744,8 @@ struct ParallelReduceReturnValue<
   using value_type_scalar = typename return_type::value_type;
   using value_type_array  = typename return_type::value_type* const;
 
-  using value_type = typename if_c<return_type::rank == 0, value_type_scalar,
-                                   value_type_array>::type;
+  using value_type = std::conditional_t<return_type::rank == 0,
+                                        value_type_scalar, value_type_array>;
 
   static return_type& return_value(ReturnType& return_val, const FunctorType&) {
     return return_val;
@@ -1109,10 +1109,9 @@ inline void parallel_reduce(
         Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
         nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type =
-      typename Kokkos::Impl::if_c<(ValueTraits::StaticValueSize != 0),
-                                  typename ValueTraits::value_type,
-                                  typename ValueTraits::pointer_type>::type;
+  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
+                                        typename ValueTraits::value_type,
+                                        typename ValueTraits::pointer_type>;
 
   static_assert(
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
@@ -1135,10 +1134,9 @@ inline void parallel_reduce(
         Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
         nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type =
-      typename Kokkos::Impl::if_c<(ValueTraits::StaticValueSize != 0),
-                                  typename ValueTraits::value_type,
-                                  typename ValueTraits::pointer_type>::type;
+  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
+                                        typename ValueTraits::value_type,
+                                        typename ValueTraits::pointer_type>;
 
   static_assert(
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
@@ -1160,10 +1158,9 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor) {
       typename Impl::ParallelReducePolicyType<void, size_t,
                                               FunctorType>::policy_type;
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type =
-      typename Kokkos::Impl::if_c<(ValueTraits::StaticValueSize != 0),
-                                  typename ValueTraits::value_type,
-                                  typename ValueTraits::pointer_type>::type;
+  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
+                                        typename ValueTraits::value_type,
+                                        typename ValueTraits::pointer_type>;
 
   static_assert(
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
@@ -1188,10 +1185,9 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
       typename Impl::ParallelReducePolicyType<void, size_t,
                                               FunctorType>::policy_type;
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type =
-      typename Kokkos::Impl::if_c<(ValueTraits::StaticValueSize != 0),
-                                  typename ValueTraits::value_type,
-                                  typename ValueTraits::pointer_type>::type;
+  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
+                                        typename ValueTraits::value_type,
+                                        typename ValueTraits::pointer_type>;
 
   static_assert(
       Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp
index b8e0c74be41d98e960dc897e7526b92bfe951d84..aa720371df73cb1ad7bba8191e5c6d83c6c317c5 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp
@@ -54,6 +54,7 @@
 #include <Kokkos_ScratchSpace.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
 
 namespace Kokkos {
 namespace Experimental {
@@ -79,23 +80,22 @@ class SYCL {
 
   using scratch_memory_space = ScratchMemorySpace<SYCL>;
 
-  ~SYCL() = default;
   SYCL();
-
-  SYCL(SYCL&&)      = default;
-  SYCL(const SYCL&) = default;
-  SYCL& operator=(SYCL&&) = default;
-  SYCL& operator=(const SYCL&) = default;
+  explicit SYCL(const sycl::queue&);
 
   uint32_t impl_instance_id() const noexcept { return 0; }
 
+  sycl::context sycl_context() const noexcept {
+    return m_space_instance->m_queue->get_context();
+  };
+
   //@}
   //------------------------------------
   //! \name Functions that all Kokkos devices must implement.
   //@{
 
   KOKKOS_INLINE_FUNCTION static int in_parallel() {
-#if defined(__SYCL_ARCH__)
+#if defined(__SYCL_DEVICE_ONLY__)
     return true;
 #else
     return false;
@@ -123,25 +123,21 @@ class SYCL {
    */
 
   struct SYCLDevice {
-    SYCLDevice();
-    explicit SYCLDevice(cl::sycl::device d);
-    explicit SYCLDevice(const cl::sycl::device_selector& selector);
+    SYCLDevice() : SYCLDevice(sycl::default_selector()) {}
+    explicit SYCLDevice(sycl::device d);
+    explicit SYCLDevice(const sycl::device_selector& selector);
     explicit SYCLDevice(size_t id);
-    explicit SYCLDevice(const std::function<bool(const sycl::device&)>& pred);
 
-    cl::sycl::device get_device() const;
+    sycl::device get_device() const;
 
     friend std::ostream& operator<<(std::ostream& os, const SYCLDevice& that) {
       return that.info(os);
     }
 
-    static std::ostream& list_devices(std::ostream& os);
-    static void list_devices();
-
    private:
     std::ostream& info(std::ostream& os) const;
 
-    cl::sycl::device m_device;
+    sycl::device m_device;
   };
 
   static void impl_initialize(SYCLDevice = SYCLDevice());
@@ -154,11 +150,11 @@ class SYCL {
   static const char* name();
 
   inline Impl::SYCLInternal* impl_internal_space_instance() const {
-    return m_space_instance;
+    return m_space_instance.get();
   }
 
  private:
-  Impl::SYCLInternal* m_space_instance;
+  Kokkos::Impl::HostSharedPtr<Impl::SYCLInternal> m_space_instance;
 };
 
 namespace Impl {
diff --git a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
index f9ee6ec44ee06774978af25719bd3ee60829ab66..392ab0e59a7d01f42342318bb44aa172bcb4f705 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
@@ -49,6 +49,7 @@
 
 #ifdef KOKKOS_ENABLE_SYCL
 #include <Kokkos_Concepts.hpp>
+#include <Kokkos_ScratchSpace.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
 #include <impl/Kokkos_Tools.hpp>
@@ -64,6 +65,7 @@ class SYCLDeviceUSMSpace {
   using size_type       = Impl::SYCLInternal::size_type;
 
   SYCLDeviceUSMSpace();
+  explicit SYCLDeviceUSMSpace(sycl::queue queue);
 
   void* allocate(const std::size_t arg_alloc_size) const;
   void* allocate(const char* arg_label, const size_t arg_alloc_size,
@@ -78,21 +80,43 @@ class SYCLDeviceUSMSpace {
  private:
   template <class, class, class, class>
   friend class LogicalMemorySpace;
-  void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
-                      const size_t arg_logical_size = 0,
-                      const Kokkos::Tools::SpaceHandle =
-                          Kokkos::Tools::make_space_handle(name())) const;
-  void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
-                       const size_t arg_alloc_size,
-                       const size_t arg_logical_size = 0,
-                       const Kokkos::Tools::SpaceHandle =
-                           Kokkos::Tools::make_space_handle(name())) const;
 
  public:
   static constexpr const char* name() { return "SYCLDeviceUSM"; };
 
  private:
-  int m_device;
+  sycl::queue m_queue;
+};
+
+class SYCLSharedUSMSpace {
+ public:
+  using execution_space = SYCL;
+  using memory_space    = SYCLSharedUSMSpace;
+  using device_type     = Kokkos::Device<execution_space, memory_space>;
+  using size_type       = Impl::SYCLInternal::size_type;
+
+  SYCLSharedUSMSpace();
+  explicit SYCLSharedUSMSpace(sycl::queue queue);
+
+  void* allocate(const std::size_t arg_alloc_size) const;
+  void* allocate(const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
+
+  void deallocate(void* const arg_alloc_ptr,
+                  const std::size_t arg_alloc_size) const;
+  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                  const size_t arg_alloc_size,
+                  const size_t arg_logical_size = 0) const;
+
+ private:
+  template <class, class, class, class>
+  friend class LogicalMemorySpace;
+
+ public:
+  static constexpr const char* name() { return "SYCLSharedUSM"; };
+
+ private:
+  sycl::queue m_queue;
 };
 }  // namespace Experimental
 
@@ -102,6 +126,11 @@ static_assert(Kokkos::Impl::MemorySpaceAccess<
                   Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
               "");
 
+static_assert(Kokkos::Impl::MemorySpaceAccess<
+                  Kokkos::Experimental::SYCLSharedUSMSpace,
+                  Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+              "");
+
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace,
                          Kokkos::Experimental::SYCLDeviceUSMSpace> {
@@ -110,6 +139,15 @@ struct MemorySpaceAccess<Kokkos::HostSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::SYCLSharedUSMSpace> {
+  // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
 template <>
 struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
                          Kokkos::HostSpace> {
@@ -118,20 +156,79 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                         Kokkos::Experimental::SYCLSharedUSMSpace> {
+  // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+// SYCLSharedUSMSpace::execution_space == SYCL
+// SYCLSharedUSMSpace accessible to both SYCL and Host
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };  // SYCL cannot access HostSpace
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::Experimental::SYCLDeviceUSMSpace> {
+  // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space
+  // Can access SYCLSharedUSMSpace from Host but cannot access
+  // SYCLDeviceUSMSpace from Host
+  enum : bool { assignable = false };
+
+  // SYCLSharedUSMSpace::execution_space can access SYCLDeviceUSMSpace
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<
+    Kokkos::Experimental::SYCLDeviceUSMSpace,
+    Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
+};
+
+template <>
+struct MemorySpaceAccess<
+    Kokkos::Experimental::SYCLSharedUSMSpace,
+    Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
+};
+
 }  // namespace Impl
 
 namespace Impl {
 
 template <>
 class SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>
-    : public SharedAllocationRecord<void, void> {
+    : public HostInaccessibleSharedAllocationRecordCommon<
+          Kokkos::Experimental::SYCLDeviceUSMSpace> {
  private:
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::SYCLDeviceUSMSpace>;
+  friend class HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::SYCLDeviceUSMSpace>;
+  using base_t = HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::SYCLDeviceUSMSpace>;
   using RecordBase = SharedAllocationRecord<void, void>;
 
   SharedAllocationRecord(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
   SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
-
-  static void deallocate(RecordBase*);
+  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;
 
 #ifdef KOKKOS_ENABLE_DEBUG
   static RecordBase s_root_record;
@@ -145,32 +242,38 @@ class SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>
   SharedAllocationRecord(
       const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+};
 
- public:
-  std::string get_label() const;
+template <>
+class SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>
+    : public SharedAllocationRecordCommon<
+          Kokkos::Experimental::SYCLSharedUSMSpace> {
+ private:
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::SYCLSharedUSMSpace>;
+  using base_t =
+      SharedAllocationRecordCommon<Kokkos::Experimental::SYCLSharedUSMSpace>;
+  using RecordBase = SharedAllocationRecord<void, void>;
 
-  static SharedAllocationRecord* allocate(
-      const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size);
+  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
+  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;
 
-  /**\brief  Allocate tracked memory in the space */
-  static void* allocate_tracked(
-      const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size);
+  static RecordBase s_root_record;
 
-  /**\brief  Reallocate tracked memory in the space */
-  static void* reallocate_tracked(void* const arg_alloc_ptr,
-                                  const size_t arg_alloc_size);
+  const Kokkos::Experimental::SYCLSharedUSMSpace m_space;
 
-  /**\brief  Deallocate tracked memory in the space */
-  static void deallocate_tracked(void* const arg_alloc_ptr);
+ protected:
+  ~SharedAllocationRecord();
 
-  static SharedAllocationRecord* get_record(void* arg_alloc_ptr);
+  SharedAllocationRecord() = default;
 
-  static void print_records(std::ostream&,
-                            const Kokkos::Experimental::SYCLDeviceUSMSpace&,
-                            bool detail = false);
+  SharedAllocationRecord(
+      const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
index f6b0a5fbeb04378d1c4d62b492b5c0e8632cc495..2eebf5365e71d2c5cf42c356951ccec9d041fe14 100644
--- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -70,8 +70,8 @@ class ScratchMemorySpace {
 
  private:
   mutable char* m_iter_L0 = nullptr;
-  char* m_end_L0          = nullptr;
   mutable char* m_iter_L1 = nullptr;
+  char* m_end_L0          = nullptr;
   char* m_end_L1          = nullptr;
 
   mutable int m_multiplier    = 0;
@@ -100,89 +100,46 @@ class ScratchMemorySpace {
   template <typename IntType>
   KOKKOS_INLINE_FUNCTION void* get_shmem(const IntType& size,
                                          int level = -1) const {
-    if (level == -1) level = m_default_level;
-    if (level == 0) {
-      void* tmp = m_iter_L0 + m_offset * align(size);
-      if (m_end_L0 < (m_iter_L0 += align(size) * m_multiplier)) {
-        m_iter_L0 -= align(size) * m_multiplier;  // put it back like it was
-#ifdef KOKKOS_ENABLE_DEBUG
-        // mfh 23 Jun 2015: printf call consumes 25 registers
-        // in a CUDA build, so only print in debug mode.  The
-        // function still returns nullptr if not enough memory.
-        printf(
-            "ScratchMemorySpace<...>::get_shmem: Failed to allocate "
-            "%ld byte(s); remaining capacity is %ld byte(s)\n",
-            long(size), long(m_end_L0 - m_iter_L0));
-#endif  // KOKKOS_ENABLE_DEBUG
-        tmp = nullptr;
-      }
-      return tmp;
-    } else {
-      void* tmp = m_iter_L1 + m_offset * align(size);
-      if (m_end_L1 < (m_iter_L1 += align(size) * m_multiplier)) {
-        m_iter_L1 -= align(size) * m_multiplier;  // put it back like it was
-#ifdef KOKKOS_ENABLE_DEBUG
-        // mfh 23 Jun 2015: printf call consumes 25 registers
-        // in a CUDA build, so only print in debug mode.  The
-        // function still returns nullptr if not enough memory.
-        printf(
-            "ScratchMemorySpace<...>::get_shmem: Failed to allocate "
-            "%ld byte(s); remaining capacity is %ld byte(s)\n",
-            long(size), long(m_end_L1 - m_iter_L1));
-#endif  // KOKKOS_ENABLE_DEBUG
-        tmp = nullptr;
-      }
-      return tmp;
-    }
+    return get_shmem_common</*aligned*/ false>(size, 1, level);
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void* get_shmem_aligned(const ptrdiff_t size, const ptrdiff_t alignment,
-                          int level = -1) const {
+  template <typename IntType>
+  KOKKOS_INLINE_FUNCTION void* get_shmem_aligned(const IntType& size,
+                                                 const ptrdiff_t alignment,
+                                                 int level = -1) const {
+    return get_shmem_common</*aligned*/ true>(size, alignment, level);
+  }
+
+ private:
+  template <bool aligned, typename IntType>
+  KOKKOS_INLINE_FUNCTION void* get_shmem_common(const IntType& size,
+                                                const ptrdiff_t alignment,
+                                                int level = -1) const {
     if (level == -1) level = m_default_level;
-    if (level == 0) {
-      char* previous            = m_iter_L0;
-      const ptrdiff_t missalign = size_t(m_iter_L0) % alignment;
-      if (missalign) m_iter_L0 += alignment - missalign;
-
-      void* tmp = m_iter_L0 + m_offset * size;
-      if (m_end_L0 < (m_iter_L0 += size * m_multiplier)) {
-        m_iter_L0 = previous;  // put it back like it was
-#ifdef KOKKOS_ENABLE_DEBUG
-        // mfh 23 Jun 2015: printf call consumes 25 registers
-        // in a CUDA build, so only print in debug mode.  The
-        // function still returns nullptr if not enough memory.
-        printf(
-            "ScratchMemorySpace<...>::get_shmem: Failed to allocate "
-            "%ld byte(s); remaining capacity is %ld byte(s)\n",
-            long(size), long(m_end_L0 - m_iter_L0));
-#endif  // KOKKOS_ENABLE_DEBUG
-        tmp = nullptr;
-      }
-      return tmp;
-    } else {
-      char* previous            = m_iter_L1;
-      const ptrdiff_t missalign = size_t(m_iter_L1) % alignment;
-      if (missalign) m_iter_L1 += alignment - missalign;
-
-      void* tmp = m_iter_L1 + m_offset * size;
-      if (m_end_L1 < (m_iter_L1 += size * m_multiplier)) {
-        m_iter_L1 = previous;  // put it back like it was
+    auto& m_iter              = (level == 0) ? m_iter_L0 : m_iter_L1;
+    auto& m_end               = (level == 0) ? m_end_L0 : m_end_L1;
+    char* previous            = m_iter;
+    const ptrdiff_t missalign = size_t(m_iter) % alignment;
+    if (missalign) m_iter += alignment - missalign;
+
+    void* tmp = m_iter + m_offset * (aligned ? size : align(size));
+    if (m_end < (m_iter += (aligned ? size : align(size)) * m_multiplier)) {
+      m_iter = previous;  // put it back like it was
 #ifdef KOKKOS_ENABLE_DEBUG
-        // mfh 23 Jun 2015: printf call consumes 25 registers
-        // in a CUDA build, so only print in debug mode.  The
-        // function still returns nullptr if not enough memory.
-        printf(
-            "ScratchMemorySpace<...>::get_shmem: Failed to allocate "
-            "%ld byte(s); remaining capacity is %ld byte(s)\n",
-            long(size), long(m_end_L1 - m_iter_L1));
+      // mfh 23 Jun 2015: printf call consumes 25 registers
+      // in a CUDA build, so only print in debug mode.  The
+      // function still returns nullptr if not enough memory.
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+          "%ld byte(s); remaining capacity is %ld byte(s)\n",
+          long(size), long(m_end - m_iter));
 #endif  // KOKKOS_ENABLE_DEBUG
-        tmp = nullptr;
-      }
-      return tmp;
+      tmp = nullptr;
     }
+    return tmp;
   }
 
+ public:
   KOKKOS_DEFAULTED_FUNCTION
   ScratchMemorySpace() = default;
 
@@ -192,9 +149,9 @@ class ScratchMemorySpace {
                                             void* ptr_L1           = nullptr,
                                             const IntType& size_L1 = 0)
       : m_iter_L0((char*)ptr_L0),
-        m_end_L0(m_iter_L0 + size_L0),
         m_iter_L1((char*)ptr_L1),
-        m_end_L1(m_iter_L1 + size_L1),
+        m_end_L0((char*)ptr_L0 + size_L0),
+        m_end_L1((char*)ptr_L1 + size_L1),
         m_multiplier(1),
         m_offset(0),
         m_default_level(0) {}
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Kokkos_Serial.hpp
index a1fccd37558c84c45a5e6a223664b66464556396..4d5bb2410bfaabf6f752acf55795c9d7ef82016d 100644
--- a/packages/kokkos/core/src/Kokkos_Serial.hpp
+++ b/packages/kokkos/core/src/Kokkos_Serial.hpp
@@ -197,14 +197,6 @@ struct MemorySpaceAccess<Kokkos::Serial::memory_space,
   enum : bool { deepcopy = false };
 };
 
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::Serial::memory_space, Kokkos::Serial::scratch_memory_space> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -474,8 +466,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using Analysis =
       FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
@@ -729,7 +721,15 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
  public:
   inline void execute() const { this->exec(); }
-
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
   inline ParallelFor(const FunctorType& arg_functor,
                      const MDRangePolicy& arg_policy)
       : m_functor(arg_functor),
@@ -751,8 +751,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
                                    MDRangePolicy, FunctorType>;
@@ -781,6 +781,15 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   }
 
  public:
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
   inline void execute() const {
     const size_t pool_reduce_size =
         Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
@@ -923,8 +932,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
 
diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
index b2b2cb4473a8378e0ac3d5ee952a7bd2088ea5dd..743273670c9b5fa77f6d590596eb27fc7204396a 100644
--- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -236,8 +236,8 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
         Kokkos::Impl::SharedAllocationRecord<memory_space,
                                              typename queue_type::Destroy>;
 
-    record_type* record =
-        record_type::allocate(memory_space(), "TaskQueue", sizeof(queue_type));
+    record_type* record = record_type::allocate(
+        memory_space(), "Kokkos::TaskQueue", sizeof(queue_type));
 
     m_queue = new (record->data()) queue_type(arg_memory_pool);
 
diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Kokkos_Threads.hpp
index 1374ee7106f8e3cbbe2e2986f1b4da03d2cfb816..e827c2a2a1abd46999360c1eef57eb85428436aa 100644
--- a/packages/kokkos/core/src/Kokkos_Threads.hpp
+++ b/packages/kokkos/core/src/Kokkos_Threads.hpp
@@ -211,14 +211,6 @@ struct MemorySpaceAccess<Kokkos::Threads::memory_space,
   enum : bool { deepcopy = false };
 };
 
-template <>
-struct VerifyExecutionCanAccessMemorySpace<
-    Kokkos::Threads::memory_space, Kokkos::Threads::scratch_memory_space> {
-  enum : bool { value = true };
-  inline static void verify(void) {}
-  inline static void verify(const void*) {}
-};
-
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp
index 83492babcf927ed37137359ba1231acd30ce70ee..f7cc34cc114d29cbe5612bf4350fe01a498282c3 100644
--- a/packages/kokkos/core/src/Kokkos_Tuners.hpp
+++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp
@@ -48,6 +48,7 @@
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_ExecPolicy.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 
 #include <array>
@@ -182,14 +183,28 @@ struct get_space_dimensionality;
 // The dimensionality of a vector is 1
 template <class T>
 struct get_space_dimensionality<std::vector<T>> {
-  static constexpr const int value = 1;
+  static constexpr int value = 1;
 };
 
 // The dimensionality of a map is 1 (the map) plus the dimensionality
 // of the map's value type
 template <class K, class V>
 struct get_space_dimensionality<std::map<K, V>> {
-  static constexpr const int value = 1 + get_space_dimensionality<V>::value;
+  static constexpr int value = 1 + get_space_dimensionality<V>::value;
+};
+
+template <class T, int N>
+struct n_dimensional_sparse_structure;
+
+template <class T>
+struct n_dimensional_sparse_structure<T, 1> {
+  using type = std::vector<T>;
+};
+
+template <class T, int N>
+struct n_dimensional_sparse_structure {
+  using type =
+      std::map<T, typename n_dimensional_sparse_structure<T, N - 1>::type>;
 };
 
 /**
@@ -286,13 +301,12 @@ template <template <class...> class Container, size_t MaxDimensionSize = 100,
 class MultidimensionalSparseTuningProblem {
  public:
   using ProblemSpaceInput = Container<TemplateArguments...>;
-  static constexpr const int space_dimensionality =
+  static constexpr int space_dimensionality =
       Impl::get_space_dimensionality<ProblemSpaceInput>::value;
-  static constexpr const size_t max_space_dimension_size = MaxDimensionSize;
-  static constexpr const double tuning_min               = 0.0;
-  static constexpr const double tuning_max               = 0.999;
-  static constexpr const double tuning_step =
-      tuning_max / max_space_dimension_size;
+  static constexpr size_t max_space_dimension_size = MaxDimensionSize;
+  static constexpr double tuning_min               = 0.0;
+  static constexpr double tuning_max               = 0.999;
+  static constexpr double tuning_step = tuning_max / max_space_dimension_size;
 
   using StoredProblemSpace =
       typename Impl::MapTypeConverter<ProblemSpaceInput>::type;
@@ -470,6 +484,72 @@ class TeamSizeTuner {
  private:
 };
 
+namespace Impl {
+
+template <typename T>
+void fill_tile(std::vector<T>& cont, int tile_size) {
+  for (int x = 1; x < tile_size; x *= 2) {
+    cont.push_back(x);
+  }
+}
+template <typename T, typename Mapped>
+void fill_tile(std::map<T, Mapped>& cont, int tile_size) {
+  for (int x = 1; x < tile_size; x *= 2) {
+    fill_tile(cont[x], tile_size / x);
+  }
+}
+}  // namespace Impl
+
+template <int MDRangeRank>
+struct MDRangeTuner {
+ private:
+  static constexpr int rank       = MDRangeRank;
+  static constexpr int max_slices = 15;
+  using SpaceDescription =
+      typename Impl::n_dimensional_sparse_structure<int, rank>::type;
+  using TunerType =
+      decltype(make_multidimensional_sparse_tuning_problem<max_slices>(
+          std::declval<SpaceDescription>(),
+          std::declval<std::vector<std::string>>()));
+  TunerType tuner;
+
+ public:
+  MDRangeTuner() = default;
+  template <typename Functor, typename TagType, typename Calculator,
+            typename... Properties>
+  MDRangeTuner(const std::string& name,
+               const Kokkos::MDRangePolicy<Properties...>& policy,
+               const Functor& functor, const TagType& tag, Calculator calc) {
+    SpaceDescription desc;
+    int max_tile_size =
+        calc.get_mdrange_max_tile_size_product(policy, functor, tag);
+    Impl::fill_tile(desc, max_tile_size);
+    std::vector<std::string> feature_names;
+    for (int x = 0; x < rank; ++x) {
+      feature_names.push_back(name + "_tile_size_" + std::to_string(x));
+    }
+    tuner = make_multidimensional_sparse_tuning_problem<max_slices>(
+        desc, feature_names);
+  }
+  template <typename Policy, typename Tuple, size_t... Indices>
+  void set_policy_tile(Policy& policy, const Tuple& tuple,
+                       const std::index_sequence<Indices...>&) {
+    policy.impl_change_tile_size({std::get<Indices>(tuple)...});
+  }
+  template <typename... Properties>
+  void tune(Kokkos::MDRangePolicy<Properties...>& policy) {
+    if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+      auto configuration = tuner.begin();
+      set_policy_tile(policy, configuration, std::make_index_sequence<rank>{});
+    }
+  }
+  void end() {
+    if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+      tuner.end();
+    }
+  }
+};
+
 }  // namespace Experimental
 }  // namespace Tools
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp
index 8fffe20f5580964c565ba6bd6a6f71bb484e265e..1abe0a48df5eab32f01ef703e6d39921eb9c70c3 100644
--- a/packages/kokkos/core/src/Kokkos_View.hpp
+++ b/packages/kokkos/core/src/Kokkos_View.hpp
@@ -789,36 +789,22 @@ class View : public ViewTraits<DataType, Properties...> {
       std::is_same<typename traits::specialize, void>::value &&
       (is_layout_left || is_layout_right || is_layout_stride);
 
-  template <class Space, bool = Kokkos::Impl::MemorySpaceAccess<
-                             Space, typename traits::memory_space>::accessible>
-  struct verify_space {
-    KOKKOS_FORCEINLINE_FUNCTION static void check() {}
-  };
-
-  template <class Space>
-  struct verify_space<Space, false> {
-    KOKKOS_FORCEINLINE_FUNCTION static void check() {
-      Kokkos::abort(
-          "Kokkos::View ERROR: attempt to access inaccessible memory space");
-    };
-  };
-
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
 
 #define KOKKOS_IMPL_SINK(ARG) ARG
 
-#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)             \
-  View::template verify_space<                            \
-      Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)                          \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             typename traits::memory_space>::check();  \
   Kokkos::Impl::view_verify_operator_bounds<typename traits::memory_space> ARG;
 
 #else
 
 #define KOKKOS_IMPL_SINK(ARG)
 
-#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \
-  View::template verify_space<                \
-      Kokkos::Impl::ActiveExecutionMemorySpace>::check();
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)                          \
+  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
+                             typename traits::memory_space>::check();
 
 #endif
 
@@ -1618,7 +1604,17 @@ class View : public ViewTraits<DataType, Properties...> {
       : View(arg_prop,
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
                                            arg_N4, arg_N5, arg_N6, arg_N7)) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    KOKKOS_IMPL_IF_ON_HOST
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+    else Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
     Impl::runtime_check_rank_host(
         traits::rank_dynamic,
         std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
@@ -1648,7 +1644,17 @@ class View : public ViewTraits<DataType, Properties...> {
       : View(arg_prop,
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
                                            arg_N4, arg_N5, arg_N6, arg_N7)) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    KOKKOS_IMPL_IF_ON_HOST
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+    else Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
     Impl::runtime_check_rank_host(
         traits::rank_dynamic,
         std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
@@ -1692,7 +1698,17 @@ class View : public ViewTraits<DataType, Properties...> {
                   "Layout is not extent constructible. A layout object should "
                   "be passed too.\n");
 
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    KOKKOS_IMPL_IF_ON_HOST
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+    else Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
     Impl::runtime_check_rank_host(
         traits::rank_dynamic,
         std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
@@ -1758,7 +1774,17 @@ class View : public ViewTraits<DataType, Properties...> {
       : View(Impl::ViewCtorProp<pointer_type>(arg_ptr),
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
                                            arg_N4, arg_N5, arg_N6, arg_N7)) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    KOKKOS_IMPL_IF_ON_HOST
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+    else Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
     Impl::runtime_check_rank_host(
         traits::rank_dynamic,
         std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
@@ -1838,7 +1864,17 @@ class View : public ViewTraits<DataType, Properties...> {
                      sizeof(typename traits::value_type)))),
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
                                            arg_N4, arg_N5, arg_N6, arg_N7)) {
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    KOKKOS_IMPL_IF_ON_HOST
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+    else Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
     Impl::runtime_check_rank_host(
         traits::rank_dynamic,
         std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
index eb15005cfb2d46a490c8f74d7b66f8961b7aff27..82f049ed136119c28b4add24f1460831fec55b16 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -49,7 +49,7 @@
 #if defined(KOKKOS_ENABLE_OPENMP)
 
 #if !defined(_OPENMP) && !defined(__CUDA_ARCH__) && \
-    !defined(__HIP_DEVICE_COMPILE__)
+    !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
 #error \
     "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
 #endif
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
index 176f452f5c33403fa802556ea8684a8db94248c9..2fc522780a495971a1d6455e19260bad0b422207 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -228,6 +228,15 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
         m_functor(arg_functor),
         m_mdr_policy(arg_policy),
         m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
 };
 
 }  // namespace Impl
@@ -257,8 +266,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   // Static Assert WorkTag void if ReducerType not InvalidType
 
@@ -430,8 +439,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
   using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
@@ -567,6 +576,15 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
       );*/
   }
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
 };
 
 }  // namespace Impl
@@ -963,8 +981,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
   using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
index efa06194782a4bc2e144dbab9175fff3980bf0cf..6fbb4245b8fb8b1e354452727ce9862c85a147c8 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
@@ -42,9 +42,10 @@
 //@HEADER
 */
 
+#include <Kokkos_Macros.hpp>
+
 #include <algorithm>
 #include <omp.h>
-#include <Kokkos_Macros.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -56,6 +57,7 @@
 #include <sstream>
 #include <cstring>
 
+#include <Kokkos_OpenMPTarget.hpp>
 #include <Kokkos_OpenMPTargetSpace.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
@@ -111,12 +113,6 @@ std::string SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
   return std::string("OpenMPTargetAllocation");
 }
 
-void SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
-                            void>::deallocate(SharedAllocationRecord<void, void>
-                                                  *arg_rec) {
-  delete static_cast<SharedAllocationRecord *>(arg_rec);
-}
-
 SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
     SharedAllocationRecord(
         const Kokkos::Experimental::OpenMPTargetSpace &arg_space,
@@ -124,7 +120,7 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
         const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
                                   void>::s_root_record,
@@ -135,12 +131,8 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
       m_space(arg_space) {
   SharedAllocationHeader header;
 
-  header.m_record = static_cast<SharedAllocationRecord<void, void> *>(this);
+  this->base_t::_fill_host_accessible_header_info(header, arg_label);
 
-  strncpy(header.m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-  // Set last element zero, in case c_str is too long
-  header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
   // TODO DeepCopy
   // DeepCopy
   Kokkos::Impl::DeepCopy<Experimental::OpenMPTargetSpace, HostSpace>(
@@ -149,30 +141,6 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
 
 //----------------------------------------------------------------------------
 
-void *SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
-    allocate_tracked(const Kokkos::Experimental::OpenMPTargetSpace &arg_space,
-                     const std::string &arg_alloc_label,
-                     const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord *const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
-}
-
-void SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
-                            void>::deallocate_tracked(void *const
-                                                          arg_alloc_ptr) {
-  if (arg_alloc_ptr != nullptr) {
-    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
-  }
-}
-
 void *SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
     reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) {
   SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr);
@@ -190,48 +158,6 @@ void *SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
   return r_new->data();
 }
 
-SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>
-    *SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
-                            void>::get_record(void *alloc_ptr) {
-  using Header = SharedAllocationHeader;
-  using RecordHost =
-      SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>;
-
-  if (alloc_ptr) {
-    Header head;
-    const Header *const head_ompt = Header::get_header(alloc_ptr);
-
-    Kokkos::Impl::DeepCopy<HostSpace, Experimental::OpenMPTargetSpace>(
-        &head, head_ompt, sizeof(SharedAllocationHeader));
-
-    RecordHost *record = static_cast<RecordHost *>(head.m_record);
-    if (record->m_alloc_ptr == head_ompt) {
-      return record;
-    }
-  }
-  Kokkos::Impl::throw_runtime_exception(std::string(
-      "Kokkos::Experimental::Impl::SharedAllocationRecord< "
-      "Kokkos::Experimental::OpenMPTargetSpace , void >::get_record ERROR"));
-  return nullptr;
-}
-
-// Iterate records to print orphaned memory ...
-void SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
-    print_records(std::ostream &s,
-                  const Kokkos::Experimental::OpenMPTargetSpace &,
-                  bool detail) {
-#ifdef KOKKOS_ENABLE_DEBUG
-  SharedAllocationRecord<void, void>::print_host_accessible_records(
-      s, "OpenMPTargetSpace", &s_root_record, detail);
-#else
-  (void)s;
-  (void)detail;
-  throw_runtime_exception(
-      "SharedAllocationRecord<OpenMPTargetSpace>::print_records"
-      " only works with KOKKOS_ENABLE_DEBUG enabled");
-#endif
-}
-
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -303,3 +229,25 @@ HOST_SPACE_ATOMIC_XOR_MASK] , 0);
 
 }
 }*/
+
+//==============================================================================
+// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
+
+#include <impl/Kokkos_SharedAlloc_timpl.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+// To avoid additional compilation cost for something that's (mostly?) not
+// performance sensitive, we explicity instantiate these CRTP base classes here,
+// where we have access to the associated *_timpl.hpp header files.
+template class HostInaccessibleSharedAllocationRecordCommon<
+    Kokkos::Experimental::OpenMPTargetSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::OpenMPTargetSpace>;
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
+//==============================================================================
diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp
similarity index 73%
rename from packages/kokkos/algorithms/unit_tests/TestOpenMP.cpp
rename to packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp
index 5ded3ce39065dc5fb7510973de40577703429f8d..1ca30631af920badd089559874a7d24a7cfb63f7 100644
--- a/packages/kokkos/algorithms/unit_tests/TestOpenMP.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp
@@ -42,22 +42,32 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_OPENMP
+#ifndef KOKKOS_OPENMPTARGET_ERROR_HPP
+#define KOKKOS_OPENMPTARGET_ERROR_HPP
 
-#include <gtest/gtest.h>
-#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <sstream>
 
-//----------------------------------------------------------------------------
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
-#include <iomanip>
+namespace Kokkos {
+namespace Impl {
 
-namespace Test {
+inline void ompt_internal_safe_call(int e, const char* name,
+                                    const char* file = nullptr,
+                                    const int line   = 0) {
+  if (e != 0) {
+    std::ostringstream out;
+    out << name << " return value of " << e << " indicates failure";
+    if (file) {
+      out << " " << file << ":" << line;
+    }
+    throw_runtime_exception(out.str());
+  }
+}
 
-TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort<Kokkos::OpenMP>(); }
+#define OMPT_SAFE_CALL(call) \
+  Kokkos::Impl::ompt_internal_safe_call(call, #call, __FILE__, __LINE__)
+
+}  // namespace Impl
+}  // namespace Kokkos
 
-}  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
 #endif
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
index 74be6a37d3d0072e38c2f1148894d4bfdde6133a..f13875b440b63b729a64615a20da0f597a85cf6e 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -92,8 +92,11 @@ void OpenMPTargetExec::verify_initialized(const char* const label) {
   }
 }
 
-void* OpenMPTargetExec::m_scratch_ptr    = nullptr;
-int64_t OpenMPTargetExec::m_scratch_size = 0;
+void* OpenMPTargetExec::m_scratch_ptr         = nullptr;
+int64_t OpenMPTargetExec::m_scratch_size      = 0;
+int* OpenMPTargetExec::m_lock_array           = nullptr;
+int64_t OpenMPTargetExec::m_lock_size         = 0;
+uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr;
 
 void OpenMPTargetExec::clear_scratch() {
   Kokkos::Experimental::OpenMPTargetSpace space;
@@ -102,18 +105,28 @@ void OpenMPTargetExec::clear_scratch() {
   m_scratch_size = 0;
 }
 
+void OpenMPTargetExec::clear_lock_array() {
+  if (m_lock_array != nullptr) {
+    Kokkos::Experimental::OpenMPTargetSpace space;
+    space.deallocate(m_lock_array, m_lock_size);
+    m_lock_array = nullptr;
+    m_lock_size  = 0;
+  }
+}
+
 void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; }
 
-void OpenMPTargetExec::resize_scratch(int64_t reduce_bytes,
-                                      int64_t team_reduce_bytes,
-                                      int64_t team_shared_bytes,
-                                      int64_t thread_local_bytes) {
+void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
+                                      int64_t shmem_size_L1) {
   Kokkos::Experimental::OpenMPTargetSpace space;
+  const int64_t shmem_size =
+      shmem_size_L0 + shmem_size_L1;  // L0 + L1 scratch memory per team.
+  const int64_t padding = shmem_size * 10 / 100;  // Padding per team.
+  // Total amount of scratch memory allocated is depenedent
+  // on the maximum number of in-flight teams possible.
   int64_t total_size =
-      MAX_ACTIVE_TEAMS * reduce_bytes +         // Inter Team Reduction
-      MAX_ACTIVE_TEAMS * team_reduce_bytes +    // Intra Team Reduction
-      MAX_ACTIVE_TEAMS * team_shared_bytes +    // Team Local Scratch
-      MAX_ACTIVE_THREADS * thread_local_bytes;  // Thread Private Scratch
+      (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) *
+      (MAX_ACTIVE_THREADS / team_size);
 
   if (total_size > m_scratch_size) {
     space.deallocate(m_scratch_ptr, m_scratch_size);
@@ -121,6 +134,35 @@ void OpenMPTargetExec::resize_scratch(int64_t reduce_bytes,
     m_scratch_ptr  = space.allocate(total_size);
   }
 }
+
+int* OpenMPTargetExec::get_lock_array(int num_teams) {
+  Kokkos::Experimental::OpenMPTargetSpace space;
+  int max_active_league_size = MAX_ACTIVE_THREADS / 32;
+  int lock_array_elem =
+      (num_teams > max_active_league_size) ? num_teams : max_active_league_size;
+  if (m_lock_size < (lock_array_elem * sizeof(int))) {
+    space.deallocate(m_lock_array, m_lock_size);
+    m_lock_size  = lock_array_elem * sizeof(int);
+    m_lock_array = static_cast<int*>(space.allocate(m_lock_size));
+
+    // FIXME_OPENMPTARGET - Creating a target region here to initialize the
+    // lock_array with 0's fails. Hence creating an equivalent host array to
+    // achieve the same. Value of host array are then copied to the lock_array.
+    int* h_lock_array = static_cast<int*>(
+        omp_target_alloc(m_lock_size, omp_get_initial_device()));
+
+    for (int i = 0; i < lock_array_elem; ++i) h_lock_array[i] = 0;
+
+    OMPT_SAFE_CALL(omp_target_memcpy(m_lock_array, h_lock_array, m_lock_size, 0,
+                                     0, omp_get_default_device(),
+                                     omp_get_initial_device()));
+
+    omp_target_free(h_lock_array, omp_get_initial_device());
+  }
+
+  return m_lock_array;
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
index 5bf1cdd9e119f2a76d02df7f11ddba81f0194a84..0b65e0d4a4b2270fdf577b4fffc1a10835467a47 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
@@ -51,6 +51,11 @@
 #include <Kokkos_Atomic.hpp>
 #include "Kokkos_OpenMPTarget_Abort.hpp"
 
+// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
+// hierarchical reducers. It avoids hitting the code path which we wanted to
+// write but doesn't work. undef'ed at the end.
+#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -495,7 +500,11 @@ namespace Impl {
 
 class OpenMPTargetExec {
  public:
-  enum { MAX_ACTIVE_THREADS = 256 * 8 * 56 * 4 };
+  // FIXME_OPENMPTARGET - Currently the maximum number of
+  // teams possible is calculated based on NVIDIA's Volta GPU. In
+  // future this value should be based on the chosen architecture for the
+  // OpenMPTarget backend.
+  enum { MAX_ACTIVE_THREADS = 2080 * 80 };
   enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32 };
 
  private:
@@ -505,14 +514,19 @@ class OpenMPTargetExec {
   static void verify_is_process(const char* const);
   static void verify_initialized(const char* const);
 
+  static int* get_lock_array(int num_teams);
   static void* get_scratch_ptr();
   static void clear_scratch();
-  static void resize_scratch(int64_t reduce_bytes, int64_t team_reduce_bytes,
+  static void clear_lock_array();
+  static void resize_scratch(int64_t team_reduce_bytes,
                              int64_t team_shared_bytes,
                              int64_t thread_local_bytes);
 
   static void* m_scratch_ptr;
   static int64_t m_scratch_size;
+  static int* m_lock_array;
+  static int64_t m_lock_size;
+  static uint32_t* m_uniquetoken_ptr;
 };
 
 }  // namespace Impl
@@ -542,6 +556,7 @@ class OpenMPTargetExecTeamMember {
   int m_league_size;
   int m_vector_length;
   int m_vector_lane;
+  int m_shmem_block_index;
   void* m_glb_scratch;
   void* m_reduce_scratch;
 
@@ -583,13 +598,14 @@ class OpenMPTargetExecTeamMember {
   }
 
   KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_scratch(int) const {
-    return m_team_shared.set_team_thread_mode(0, 1, 0);
+  const execution_space::scratch_memory_space& team_scratch(int level) const {
+    return m_team_shared.set_team_thread_mode(level, 1,
+                                              m_team_scratch_size[level]);
   }
 
   KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& thread_scratch(int) const {
-    return m_team_shared.set_team_thread_mode(0, team_size(), team_rank());
+  const execution_space::scratch_memory_space& thread_scratch(int level) const {
+    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
   }
 
   KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
@@ -605,23 +621,25 @@ class OpenMPTargetExecTeamMember {
   }
 
   template <class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& /*value*/,
-                                             const int& /*thread_id*/) const {
-    // FIXME_OPENMPTARGET
-    /*#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-        { }
-    #else
-        // Make sure there is enough scratch space:
-        using type  = typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
-                             , ValueType , void >::type;
-
-        type * const local_value = ((type*) m_exec.scratch_thread());
-        if(team_rank() == thread_id)
-          *local_value = value;
-        memory_fence();
-        team_barrier();
-        value = *local_value;
-    #endif*/
+  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value,
+                                             int thread_id) const {
+    // Make sure there is enough scratch space:
+    using type =
+        typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
+                                  ValueType, void>::type;
+    type* team_scratch = reinterpret_cast<type*>(
+        ((char*)(m_glb_scratch) + TEAM_REDUCE_SIZE * omp_get_team_num()));
+#pragma omp barrier
+    if (team_rank() == thread_id) *team_scratch = value;
+#pragma omp barrier
+    value = *team_scratch;
+  }
+
+  template <class Closure, class ValueType>
+  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value,
+                                             const int& thread_id) const {
+    f(value);
+    team_broadcast(value, thread_id);
   }
 
   template <class ValueType, class JoinOp>
@@ -633,8 +651,8 @@ class OpenMPTargetExecTeamMember {
     const JoinLambdaAdapter<value_type, JoinOp> op(op_in);
 
     // Make sure there is enough scratch space:
-    using type = typename if_c<sizeof(value_type) < TEAM_REDUCE_SIZE,
-                               value_type, void>::type;
+    using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE),
+                                    value_type, void>;
 
     const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
     type* team_scratch =
@@ -674,7 +692,7 @@ class OpenMPTargetExecTeamMember {
     // FIXME_OPENMPTARGET
     /*  // Make sure there is enough scratch space:
       using type =
-        typename if_c<sizeof(ArgType) < TEAM_REDUCE_SIZE, ArgType, void>::type;
+        std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>;
 
       volatile type * const work_value  = ((type*) m_exec.scratch_thread());
 
@@ -733,26 +751,46 @@ class OpenMPTargetExecTeamMember {
   using space = execution_space::scratch_memory_space;
 
  public:
+  // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space
+  // for each league is saved for reduction. It should actually be based on the
+  // ValueType of the reduction variable.
   inline OpenMPTargetExecTeamMember(
       const int league_rank, const int league_size, const int team_size,
       const int vector_length  // const TeamPolicyInternal< OpenMPTarget,
                                // Properties ...> & team
       ,
-      void* const glb_scratch, const int shmem_size_L1, const int shmem_size_L2)
-      : m_team_shared(nullptr, 0),
-        m_team_scratch_size{shmem_size_L1, shmem_size_L2},
+      void* const glb_scratch, const int shmem_block_index,
+      const int shmem_size_L0, const int shmem_size_L1)
+      : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
         m_team_rank(0),
         m_team_size(team_size),
         m_league_rank(league_rank),
         m_league_size(league_size),
         m_vector_length(vector_length),
+        m_shmem_block_index(shmem_block_index),
         m_glb_scratch(glb_scratch) {
-    const int omp_tid      = omp_get_thread_num();
-    const int omp_team_num = omp_get_team_num();
-    m_reduce_scratch = (char*)glb_scratch + omp_team_num * TEAM_REDUCE_SIZE;
-    m_league_rank    = league_rank;
-    m_team_rank      = omp_tid;
-    m_vector_lane    = 0;
+    const int omp_tid = omp_get_thread_num();
+    m_team_shared     = scratch_memory_space(
+        ((char*)glb_scratch +
+         m_shmem_block_index *
+             (shmem_size_L0 + shmem_size_L1 +
+              ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE)),
+        shmem_size_L0,
+        ((char*)glb_scratch +
+         m_shmem_block_index * (shmem_size_L0 + shmem_size_L1 +
+                                ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
+                                TEAM_REDUCE_SIZE)) +
+            shmem_size_L0 + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
+            TEAM_REDUCE_SIZE,
+        shmem_size_L1);
+    m_reduce_scratch =
+        (char*)glb_scratch +
+        shmem_block_index *
+            (shmem_size_L0 + shmem_size_L1 +
+             ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE);
+    m_league_rank = league_rank;
+    m_team_rank   = omp_tid;
+    m_vector_lane = 0;
   }
 
   static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
@@ -1047,13 +1085,16 @@ TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
       iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
 }
 
-template <typename iType>
+template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
+    typename std::common_type<iType1, iType2>::type,
+    Impl::OpenMPTargetExecTeamMember>
 TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType& begin, const iType& end) {
+                const iType1& begin, const iType2& end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::TeamThreadRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, begin, end);
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin),
+                                               iType(end));
 }
 
 template <typename iType>
@@ -1065,13 +1106,16 @@ ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
       iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
 }
 
-template <typename iType>
+template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
+    typename std::common_type<iType1, iType2>::type,
+    Impl::OpenMPTargetExecTeamMember>
 ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                  const iType& arg_begin, const iType& arg_end) {
+                  const iType1& arg_begin, const iType2& arg_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::ThreadVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, arg_begin, arg_end);
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
+                                               iType(arg_end));
 }
 
 template <typename iType>
@@ -1083,13 +1127,16 @@ TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
       iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
 }
 
-template <typename iType>
+template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
+    typename std::common_type<iType1, iType2>::type,
+    Impl::OpenMPTargetExecTeamMember>
 TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType& arg_begin, const iType& arg_end) {
+                const iType1& arg_begin, const iType2& arg_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::TeamVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, arg_begin, arg_end);
+      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
+                                               iType(arg_end));
 }
 
 KOKKOS_INLINE_FUNCTION
@@ -1127,26 +1174,143 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  * The range i=0..N-1 is mapped to all threads of the the calling thread team
  * and a summation of val is performed and put into result.
  */
+
 template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::TeamThreadRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, ValueType& result) {
-  ValueType* tmp_scratch =
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<!Kokkos::is_reducer_type<ValueType>::value>
+    parallel_reduce(
+        const Impl::TeamThreadRangeBoundariesStruct<
+            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+        const Lambda& lambda, ValueType& result) {
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
       static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
 #pragma omp barrier
-  tmp_scratch[0] = ValueType();
+  TeamThread_scratch[0] = ValueType();
 #pragma omp barrier
 
-#pragma omp for reduction(+ : tmp_scratch[:1]) schedule(static, 1)
+  if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp for reduction(+ : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamThread_scratch[0] += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for reduction(custom : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamThread_scratch[0] += tmp;
+    }
+  }
+
+  result = TeamThread_scratch[0];
+}
+
+#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
+// For some reason the actual version we wanted to write doesn't work
+// and crashes. We should try this with every new compiler
+// This is the variant we actually wanted to write
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
+    parallel_reduce(
+        const Impl::TeamThreadRangeBoundariesStruct<
+            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+        const Lambda& lambda, ReducerType result) {
+  using ValueType = typename ReducerType::value_type;
+
+#pragma omp declare reduction(                                               \
+    custominner:ValueType                                                    \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  // These three lines all cause crash
+  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
+//  result.init(TeamThread_scratch[0]);
+//  Impl::OpenMPTargetReducerWrapper<ReducerType> red;
+//  red.init(TeamThread_scratch[0]);
+#pragma omp barrier
+
+#pragma omp for reduction(custominner : TeamThread_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp = ValueType();
+    ValueType tmp;
+    result.init(tmp);
     lambda(i, tmp);
-    tmp_scratch[0] += tmp;
+    // This line causes a crash
+    Impl::OpenMPTargetReducerWrapper<ReducerType>::join(TeamThread_scratch[0],
+                                                        tmp);
+  }
+  result.reference() = TeamThread_scratch[0];
+}
+#else
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
+    parallel_reduce(
+        const Impl::TeamThreadRangeBoundariesStruct<
+            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+        const Lambda& lambda, ReducerType result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp declare reduction(                                               \
+    omp_red_teamthread_reducer:ValueType                                     \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp barrier
+  ValueType tmp;
+  result.init(tmp);
+  TeamThread_scratch[0] = tmp;
+#pragma omp barrier
+
+  iType team_size = iType(omp_get_num_threads());
+#pragma omp for reduction(omp_red_teamthread_reducer \
+                          : TeamThread_scratch[:1]) schedule(static, 1)
+  for (iType t = 0; t < team_size; t++) {
+    ValueType tmp2;
+    result.init(tmp2);
+
+    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
+         i += team_size) {
+      lambda(i, tmp2);
+    }
+    TeamThread_scratch[0] = tmp2;
   }
 
-  result = tmp_scratch[0];
+  result.reference() = TeamThread_scratch[0];
 }
+#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 
 /** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
  * ValueType & val) for each i=0..N-1.
@@ -1163,16 +1327,38 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
     const Impl::TeamThreadRangeBoundariesStruct<
         iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
     const Lambda& lambda, const JoinType& join, ValueType& init_result) {
-  ValueType result = init_result;
+  ValueType* TeamThread_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i, tmp);
-    join(result, tmp);
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+#pragma omp barrier
+  TeamThread_scratch[0] = init_result;
+#pragma omp barrier
+
+  if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp for reduction(+ : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamThread_scratch[0] += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for reduction(custom : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      join(TeamThread_scratch[0], tmp);
+    }
   }
 
-  // init_result = loop_boundaries.thread.team_reduce(result,join);
+  init_result = TeamThread_scratch[0];
 }
 
 // This is largely the same code as in HIP and CUDA except for the member name
@@ -1216,6 +1402,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 }
 
 }  // namespace Kokkos
+#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 
 namespace Kokkos {
 /** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
@@ -1244,15 +1431,52 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
         iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
     const Lambda& lambda, ValueType& result) {
   ValueType vector_reduce = ValueType();
+
+  if constexpr (std::is_arithmetic<ValueType>::value) {
 #pragma omp simd reduction(+ : vector_reduce)
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp = ValueType();
-    lambda(i, tmp);
-    vector_reduce += tmp;
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      vector_reduce += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp simd reduction(custom : vector_reduce)
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      lambda(i, vector_reduce);
+    }
   }
+
   result = vector_reduce;
 }
 
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
+    parallel_reduce(
+        const Impl::ThreadVectorRangeBoundariesStruct<
+            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+        const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+#pragma omp declare reduction(                                               \
+    custom:ValueType                                                         \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+  ValueType vector_reduce;
+  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce);
+
+#pragma omp simd reduction(custom : vector_reduce)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, vector_reduce);
+  }
+
+  result.reference() = vector_reduce;
+}
+
 /** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
  * ValueType & val) for each i=0..N-1.
  *
@@ -1269,14 +1493,15 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
         iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
     const Lambda& lambda, const JoinType& join, ValueType& init_result) {
   ValueType result = init_result;
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
+
+  // FIXME_OPENMPTARGET think about omp simd
+  // join does not work with omp reduction clause
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
     ValueType tmp = ValueType();
     lambda(i, tmp);
     join(result, tmp);
   }
+
   init_result = result;
 }
 
@@ -1324,7 +1549,7 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::TeamVectorRangeBoundariesStruct<
         iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
     const Lambda& lambda) {
-#pragma omp for simd
+#pragma omp for simd nowait schedule(static, 1)
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
 }
 
@@ -1339,22 +1564,130 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
     const Impl::TeamVectorRangeBoundariesStruct<
         iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
     const Lambda& lambda, ValueType& result) {
-  ValueType* tmp_scratch =
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamVector_scratch =
       static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
 #pragma omp barrier
-  tmp_scratch[0] = ValueType();
+  TeamVector_scratch[0] = ValueType();
 #pragma omp barrier
 
-#pragma omp for simd reduction(+ : tmp_scratch[:1])
+  if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp for simd reduction(+ : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  }
+
+  result = TeamVector_scratch[0];
+}
+
+#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
+    parallel_reduce(
+        const Impl::TeamVectorRangeBoundariesStruct<
+            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+        const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+#pragma omp declare reduction(                                               \
+    custom:ValueType                                                         \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+  ValueType* TeamVector_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]);
+#pragma omp barrier
+
+#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
     ValueType tmp = ValueType();
     lambda(i, tmp);
-    tmp_scratch[0] += tmp;
+    TeamVector_scratch[0] += tmp;
+  }
+
+  result.reference() = TeamVector_scratch[0];
+}
+#else
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
+    parallel_reduce(
+        const Impl::TeamVectorRangeBoundariesStruct<
+            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+        const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamVector_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp declare reduction(                                               \
+    omp_red_teamthread_reducer:ValueType                                     \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                             \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp barrier
+  ValueType tmp;
+  result.init(tmp);
+  TeamVector_scratch[0] = tmp;
+#pragma omp barrier
+
+  iType team_size = iType(omp_get_num_threads());
+#pragma omp for simd reduction(omp_red_teamthread_reducer \
+                               : TeamVector_scratch[:1]) schedule(static, 1)
+  for (iType t = 0; t < team_size; t++) {
+    ValueType tmp2;
+    result.init(tmp2);
+
+    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
+         i += team_size) {
+      lambda(i, tmp2);
+    }
+    TeamVector_scratch[0] = tmp2;
   }
-  result = tmp_scratch[0];
+
+  result.reference() = TeamVector_scratch[0];
 }
+#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 }  // namespace Kokkos
 
+#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+
 namespace Kokkos {
 
 template <class FunctorType>
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index 2d68a34c9fe2f5fb500c133ee07740d502dff8b6..4a79b72732dafb9bd93613723551ec7a9b01ddd1 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -46,7 +46,12 @@
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
 
+// FIXME_OPENMPTARGET - macro for workaround implementation in UniqueToken
+// constructor. undef'ed at the end
+#define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND
+
 #include <Kokkos_OpenMPTarget.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp>
 
 #include <sstream>
@@ -63,7 +68,15 @@ void OpenMPTargetInternal::print_configuration(std::ostream& /*stream*/,
   printf("Using OpenMPTarget\n");
 }
 
-void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; }
+void OpenMPTargetInternal::impl_finalize() {
+  m_is_initialized = false;
+  Kokkos::Impl::OpenMPTargetExec space;
+  if (space.m_lock_array != nullptr) space.clear_lock_array();
+
+  if (space.m_uniquetoken_ptr != nullptr)
+    Kokkos::kokkos_free<Kokkos::Experimental::OpenMPTargetSpace>(
+        space.m_uniquetoken_ptr);
+}
 void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; }
 int OpenMPTargetInternal::impl_is_initialized() {
   return m_is_initialized ? 1 : 0;
@@ -149,4 +162,48 @@ void OpenMPTargetSpaceInitializer::print_configuration(std::ostream& msg,
 }  // namespace Impl
 }  // Namespace Kokkos
 
+namespace Kokkos {
+namespace Experimental {
+
+UniqueToken<Kokkos::Experimental::OpenMPTarget,
+            Kokkos::Experimental::UniqueTokenScope::Global>::
+    UniqueToken(Kokkos::Experimental::OpenMPTarget const&) {
+#ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND
+  uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr;
+  int count     = Kokkos::Experimental::OpenMPTarget().concurrency();
+  if (ptr == nullptr) {
+    int size = count * sizeof(uint32_t);
+    ptr      = static_cast<uint32_t*>(
+        Kokkos::kokkos_malloc<Kokkos::Experimental::OpenMPTargetSpace>(
+            "Kokkos::OpenMPTarget::m_uniquetoken_ptr", size));
+    std::vector<uint32_t> h_buf(count, 0);
+    OMPT_SAFE_CALL(omp_target_memcpy(ptr, h_buf.data(), size, 0, 0,
+                                     omp_get_default_device(),
+                                     omp_get_initial_device()));
+
+    Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr = ptr;
+  }
+#else
+// FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr`
+// with 0's
+// Version 1 - Creating a target region and filling the
+// pointer Error - CUDA error: named symbol not found
+#pragma omp target teams distribute parallel for is_device_ptr(ptr) \
+    map(to                                                          \
+        : size)
+  for (int i = 0; i < count; ++i) ptr[i] = 0;
+
+  // Version 2 : Allocating a view on the device and filling it with a scalar
+  // value of 0.
+  Kokkos::View<uint32_t*, Kokkos::Experimental::OpenMPTargetSpace> ptr_view(
+      ptr, count);
+  Kokkos::deep_copy(ptr_view, 0);
+#endif
+  m_buffer = ptr;
+  m_count  = count;
+}
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#undef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND
 #endif  // defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index 862a63672dc4fba8ec40c77a24659cf0a3b64da1..a4092c3a37a7e9a1493576c5efe783334982a391 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -51,6 +51,8 @@
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
+#define KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
+
 namespace Kokkos {
 namespace Impl {
 
@@ -84,8 +86,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
     }
   */
   template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
-  execute_impl() const {
+  inline void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -97,27 +98,13 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     FunctorType a_functor(m_functor);
 
+    if constexpr (std::is_same<TagType, void>::value) {
 #pragma omp target teams distribute parallel for map(to : a_functor)
-    for (auto i = begin; i < end; i++) a_functor(i);
-  }
-
-  template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  execute_impl() const {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const auto begin = m_policy.begin();
-    const auto end   = m_policy.end();
-
-    if (end <= begin) return;
-
-    FunctorType a_functor(m_functor);
-#pragma omp target teams distribute parallel for num_threads(128) \
-    map(to                                                        \
-        : a_functor)
-    for (auto i = begin; i < end; i++) a_functor(TagType(), i);
+      for (auto i = begin; i < end; i++) a_functor(i);
+    } else {
+#pragma omp target teams distribute parallel for map(to : a_functor)
+      for (auto i = begin; i < end; i++) a_functor(TagType(), i);
+    }
   }
 
   inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
@@ -134,8 +121,8 @@ namespace Kokkos {
 namespace Impl {
 
 template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType, int FunctorHasJoin,
-          int UseReducerType>
+          class PointerType, class ValueType, bool FunctorHasJoin,
+          bool UseReducerType>
 struct ParallelReduceSpecialize {
   static inline void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
                              PointerType /*result_ptr*/) {
@@ -150,13 +137,12 @@ struct ParallelReduceSpecialize {
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, 0, 0> {
+                                ReducerType, PointerType, ValueType, false,
+                                false> {
   using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
+  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
+                                  PointerType result_ptr) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -167,32 +153,15 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     if (end <= begin) return;
 
     ValueType result = ValueType();
+    if constexpr (std::is_same<TagType, void>::value) {
 #pragma omp target teams distribute parallel for num_teams(512) \
                 map(to:f) map(tofrom:result) reduction(+: result)
-    for (auto i = begin; i < end; i++) f(i, result);
-
-    *result_ptr = result;
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const auto begin = p.begin();
-    const auto end   = p.end();
-
-    if (end <= begin) return;
-
-    ValueType result = ValueType();
-#pragma omp target teams distribute parallel for \
-                num_teams(512) map(to:f) map(tofrom: result) \
-                reduction(+: result)
-    for (auto i = begin; i < end; i++) f(TagType(), i, result);
+      for (auto i = begin; i < end; i++) f(i, result);
+    } else {
+#pragma omp target teams distribute parallel for num_teams(512) \
+                map(to:f) map(tofrom:result) reduction(+: result)
+      for (auto i = begin; i < end; i++) f(TagType(), i, result);
+    }
 
     *result_ptr = result;
   }
@@ -206,17 +175,15 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
 template <class FunctorType, class PolicyType, class ReducerType,
           class PointerType, class ValueType>
 struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType,
-                                PointerType, ValueType, 0, 1> {
+                                PointerType, ValueType, false, true> {
 #pragma omp declare reduction(                                         \
     custom:ValueType                                                   \
     : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
     initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
+  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
+                                  PointerType result_ptr) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -229,38 +196,21 @@ struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType,
     ValueType result = ValueType();
     OpenMPTargetReducerWrapper<ReducerType>::init(result);
 
-// clang-format off
-#pragma omp target teams distribute parallel for num_teams(512) map(to: f) \
-    map(tofrom: result) reduction(custom: result)
-    for (auto i = begin; i < end; i++) f(i, result);
-    // clang-format on
-    *result_ptr = result;
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
-
-    if (end <= begin) return;
-
-    ValueType result = ValueType();
-    OpenMPTargetReducerWrapper<ReducerType>::init(result);
-
-// clang-format off
-#pragma omp target teams distribute parallel for num_teams(512) map(to: f) \
-    map(tofrom: result) reduction(custom: result)
-    for (auto i = begin; i < end; i++) f(TagType(), i, result);
-    // clang-format on
-
-    *result_ptr = result;
+    if constexpr (std::is_same<TagType, void>::value) {
+#pragma omp target teams distribute parallel for num_teams(512) map(to   \
+                                                                    : f) \
+    reduction(custom                                                     \
+              : result)
+      for (auto i = begin; i < end; i++) f(i, result);
+      *result_ptr = result;
+    } else {
+#pragma omp target teams distribute parallel for num_teams(512) map(to   \
+                                                                    : f) \
+    reduction(custom                                                     \
+              : result)
+      for (auto i = begin; i < end; i++) f(TagType(), i, result);
+      *result_ptr = result;
+    }
   }
 
   inline static void execute(const FunctorType& f, const PolicyType& p,
@@ -284,8 +234,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   // Static Assert WorkTag void if ReducerType not InvalidType
 
@@ -347,114 +297,143 @@ namespace Impl {
 template <class FunctorType, class... Traits>
 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                    Kokkos::Experimental::OpenMPTarget> {
- private:
+ protected:
   using Policy = Kokkos::RangePolicy<Traits...>;
 
   using WorkTag   = typename Policy::work_tag;
   using WorkRange = typename Policy::WorkRange;
   using Member    = typename Policy::member_type;
+  using idx_type  = typename Policy::index_type;
 
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
   using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
   using ValueJoin   = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
   using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
 
+  using value_type     = typename ValueTraits::value_type;
   using pointer_type   = typename ValueTraits::pointer_type;
   using reference_type = typename ValueTraits::reference_type;
 
   const FunctorType m_functor;
   const Policy m_policy;
-  /*
-    template< class TagType >
-    inline static
-    typename std::enable_if< std::is_same< TagType , void >::value >::type
-    exec_range( const FunctorType & functor
-              , const Member ibeg , const Member iend
-              , reference_type update , const bool final )
+
+  template <class TagType>
+  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
+  call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
+                const bool& is_final) const {
+    f(idx, val, is_final);
+  }
+  template <class TagType>
+  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
+  call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
+                const bool& is_final) const {
+    f(WorkTag(), idx, val, is_final);
+  }
+
+ public:
+  inline void impl_execute(
+      Kokkos::View<value_type**, Kokkos::LayoutRight,
+                   Kokkos::Experimental::OpenMPTargetSpace>
+          element_values,
+      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
+          chunk_values,
+      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count)
+      const {
+    const idx_type N          = m_policy.end() - m_policy.begin();
+    const idx_type chunk_size = 128;
+    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
+    idx_type nteams           = n_chunks > 512 ? 512 : n_chunks;
+    idx_type team_size        = 128;
+
+    FunctorType a_functor(m_functor);
+#pragma omp target teams distribute map(to                             \
+                                        : a_functor) num_teams(nteams) \
+    thread_limit(team_size)
+    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+#pragma omp parallel num_threads(team_size)
       {
-        #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
-        #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-        #pragma ivdep
-        #endif
-        #endif
-        for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
-          functor( iwork , update , final );
+        const idx_type local_offset = team_id * chunk_size;
+
+#pragma omp for
+        for (idx_type i = 0; i < chunk_size; i++) {
+          const idx_type idx = local_offset + i;
+          value_type val;
+          ValueInit::init(a_functor, &val);
+          if (idx < N) call_with_tag<WorkTag>(a_functor, idx, val, false);
+          element_values(team_id, i) = val;
+        }
+#pragma omp barrier
+        if (omp_get_thread_num() == 0) {
+          value_type sum;
+          ValueInit::init(a_functor, &sum);
+          for (idx_type i = 0; i < chunk_size; i++) {
+            ValueJoin::join(a_functor, &sum, &element_values(team_id, i));
+            element_values(team_id, i) = sum;
+          }
+          chunk_values(team_id) = sum;
+        }
+#pragma omp barrier
+        if (omp_get_thread_num() == 0) {
+          if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
+            value_type sum;
+            ValueInit::init(a_functor, &sum);
+            for (idx_type i = 0; i < n_chunks; i++) {
+              ValueJoin::join(a_functor, &sum, &chunk_values(i));
+              chunk_values(i) = sum;
+            }
+          }
         }
       }
+    }
 
-    template< class TagType >
-    inline static
-    typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-    exec_range( const FunctorType & functor
-              , const Member ibeg , const Member iend
-              , reference_type update , const bool final )
+#pragma omp target teams distribute map(to                             \
+                                        : a_functor) num_teams(nteams) \
+    thread_limit(team_size)
+    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+#pragma omp parallel num_threads(team_size)
       {
-        const TagType t{} ;
-        #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
-        #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-        #pragma ivdep
-        #endif
-        #endif
-        for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
-          functor( t , iwork , update , final );
+        const idx_type local_offset = team_id * chunk_size;
+        value_type offset_value;
+        if (team_id > 0)
+          offset_value = chunk_values(team_id - 1);
+        else
+          ValueInit::init(a_functor, &offset_value);
+
+#pragma omp for
+        for (idx_type i = 0; i < chunk_size; i++) {
+          const idx_type idx = local_offset + i;
+          value_type local_offset_value;
+          if (i > 0) {
+            local_offset_value = element_values(team_id, i - 1);
+            ValueJoin::join(a_functor, &local_offset_value, &offset_value);
+          } else
+            local_offset_value = offset_value;
+          if (idx < N)
+            call_with_tag<WorkTag>(a_functor, idx, local_offset_value, true);
         }
       }
-  */
- public:
-  inline void execute() const {
-    /*      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget
-    parallel_scan");
-          OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget
-    parallel_scan");
-
-          OpenMPTargetExec::resize_scratch( 2 * ValueTraits::value_size(
-    m_functor ) , 0 );
-
-    #pragma omp parallel
-          {
-            OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
-            const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size()
-    ); const pointer_type ptr = pointer_type( exec.scratch_reduce() ) +
-              ValueTraits::value_count( m_functor );
-            ParallelScan::template exec_range< WorkTag >
-              ( m_functor , range.begin() , range.end()
-              , ValueInit::init( m_functor , ptr ) , false );
-          }
-
-          {
-            const unsigned thread_count = OpenMPTargetExec::pool_size();
-            const unsigned value_count  = ValueTraits::value_count( m_functor );
-
-            pointer_type ptr_prev = 0 ;
-
-            for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
-
-              pointer_type ptr = pointer_type(
-    OpenMPTargetExec::pool_rev(rank_rev)->scratch_reduce() );
-
-              if ( ptr_prev ) {
-                for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] =
-    ptr_prev[ i + value_count ] ; } ValueJoin::join( m_functor , ptr +
-    value_count , ptr );
-              }
-              else {
-                ValueInit::init( m_functor , ptr );
-              }
-
-              ptr_prev = ptr ;
-            }
-          }
+    }
+  }
 
-    #pragma omp parallel
-          {
-            OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
-            const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size()
-    ); const pointer_type ptr = pointer_type( exec.scratch_reduce() );
-            ParallelScan::template exec_range< WorkTag >
-              ( m_functor , range.begin() , range.end()
-              , ValueOps::reference( ptr ) , true );
-          }
-    */
+  inline void execute() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const idx_type N          = m_policy.end() - m_policy.begin();
+    const idx_type chunk_size = 128;
+    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
+
+    // This could be scratch memory per team
+    Kokkos::View<value_type**, Kokkos::LayoutRight,
+                 Kokkos::Experimental::OpenMPTargetSpace>
+        element_values("element_values", n_chunks, chunk_size);
+    Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
+        chunk_values("chunk_values", n_chunks);
+    Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
+        "Count");
+
+    impl_execute(element_values, chunk_values, count);
   }
 
   //----------------------------------------
@@ -465,6 +444,51 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   //----------------------------------------
 };
 
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::Experimental::OpenMPTarget>
+    : public ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                          Kokkos::Experimental::OpenMPTarget> {
+  using base_t     = ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                              Kokkos::Experimental::OpenMPTarget>;
+  using value_type = typename base_t::value_type;
+  value_type& m_returnvalue;
+
+ public:
+  inline void execute() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const int64_t N        = base_t::m_policy.end() - base_t::m_policy.begin();
+    const int chunk_size   = 128;
+    const int64_t n_chunks = (N + chunk_size - 1) / chunk_size;
+
+    if (N > 0) {
+      // This could be scratch memory per team
+      Kokkos::View<value_type**, Kokkos::LayoutRight,
+                   Kokkos::Experimental::OpenMPTargetSpace>
+          element_values("element_values", n_chunks, chunk_size);
+      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
+          chunk_values("chunk_values", n_chunks);
+      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
+          "Count");
+
+      base_t::impl_execute(element_values, chunk_values, count);
+
+      const int size = base_t::ValueTraits::value_size(base_t::m_functor);
+      DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>(
+          &m_returnvalue, chunk_values.data() + (n_chunks - 1), size);
+    } else {
+      m_returnvalue = 0;
+    }
+  }
+
+  ParallelScanWithTotal(const FunctorType& arg_functor,
+                        const typename base_t::Policy& arg_policy,
+                        ReturnType& arg_returnvalue)
+      : base_t(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {}
+};
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -499,8 +523,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
  private:
   template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
-  execute_impl() const {
+  inline void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -508,59 +531,94 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto league_size   = m_policy.league_size();
     const auto team_size     = m_policy.team_size();
     const auto vector_length = m_policy.impl_vector_length();
-    const auto nteams        = OpenMPTargetExec::MAX_ACTIVE_TEAMS < league_size
-                            ? OpenMPTargetExec::MAX_ACTIVE_TEAMS
-                            : league_size;
 
-    OpenMPTargetExec::resize_scratch(0, Policy::member_type::TEAM_REDUCE_SIZE,
-                                     0, 0);
-    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+    const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);
 
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
     FunctorType a_functor(m_functor);
-#pragma omp target teams distribute map(to           \
-                                        : a_functor) \
-    is_device_ptr(scratch_ptr) num_teams(nteams) thread_limit(team_size)
-    for (int i = 0; i < league_size; i++) {
-#pragma omp parallel num_threads(team_size)
-      {
-        typename Policy::member_type team(i, league_size, team_size,
-                                          vector_length, scratch_ptr, 0, 0);
-        m_functor(team);
-      }
-    }
-  }
-
-  template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  execute_impl() const {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const auto league_size   = m_policy.league_size();
-    const auto team_size     = m_policy.team_size();
-    const auto vector_length = m_policy.impl_vector_length();
-    const auto nteams        = OpenMPTargetExec::MAX_ACTIVE_TEAMS < league_size
-                            ? OpenMPTargetExec::MAX_ACTIVE_TEAMS
-                            : league_size;
 
-    FunctorType a_functor(m_functor);
+    // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the
+    // scratch implementation does not work in the Release or RelWithDebugInfo
+    // mode but works in the Debug mode.
+
+    // Maximum active teams possible.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    // nteams should not exceed the maximum in-flight teams possible.
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
+
+#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
+// Performing our own scheduling of teams to avoid separation of code between
+// teams-distribute and parallel. Gave a 2x performance boost in test cases with
+// the clang compiler. atomic_compare_exchange can be avoided since the standard
+// guarantees that the number of teams specified in the `num_teams` clause is
+// always less than or equal to the maximum concurrently running teams.
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) \
+    map(to                                                         \
+        : a_functor) is_device_ptr(scratch_ptr)
+#pragma omp parallel
+    {
+      const int blockIdx = omp_get_team_num();
+      const int gridDim  = omp_get_num_teams();
+
+      // Iterate through the number of teams until league_size and assign the
+      // league_id accordingly
+      // Guarantee that the compilers respect the `num_teams` clause
+      if (gridDim <= nteams) {
+        for (int league_id = blockIdx; league_id < league_size;
+             league_id += gridDim) {
+          typename Policy::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              blockIdx, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_same<TagType, void>::value)
+            m_functor(team);
+          else
+            m_functor(TagType(), team);
+        }
+      } else
+        Kokkos::abort("`num_teams` clause was not respected.\n");
+    }
 
-    OpenMPTargetExec::resize_scratch(0, Policy::member_type::TEAM_REDUCE_SIZE,
-                                     0, 0);
-    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
-#pragma omp target teams distribute map(to           \
-                                        : a_functor) \
-    is_device_ptr(scratch_ptr) num_teams(nteams) thread_limit(team_size)
+#else
+// Saving the older implementation that uses `atomic_compare_exchange` to
+// calculate the shared memory block index and `distribute` clause to distribute
+// teams.
+#pragma omp target teams distribute map(to                   \
+                                        : a_functor)         \
+    is_device_ptr(scratch_ptr, lock_array) num_teams(nteams) \
+        thread_limit(team_size)
     for (int i = 0; i < league_size; i++) {
+      int shmem_block_index = -1, lock_team = 99999, iter = -1;
+      iter = (omp_get_team_num() % max_active_teams);
+
+      // Loop as long as a shmem_block_index is not found.
+      while (shmem_block_index == -1) {
+        // Try and acquire a lock on the index.
+        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
+
+        // If lock is acquired assign it to the block index.
+        // lock_team = 0, implies atomic_compare_exchange is successfull.
+        if (lock_team == 0)
+          shmem_block_index = iter;
+        else
+          iter = ++iter % max_active_teams;
+      }
+
 #pragma omp parallel num_threads(team_size)
       {
-        typename Policy::member_type team(i / (team_size * vector_length),
-                                          league_size, team_size, vector_length,
-                                          scratch_ptr, 0, 0);
-        m_functor(TagType(), team);
+        typename Policy::member_type team(
+            i, league_size, team_size, vector_length, scratch_ptr,
+            shmem_block_index, shmem_size_L0, shmem_size_L1);
+        m_functor(team);
       }
+
+      // Free the locked block and increment the number of available free
+      // blocks.
+      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
     }
+#endif
   }
 
  public:
@@ -575,14 +633,13 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, 0, 0> {
+                                ReducerType, PointerType, ValueType, false,
+                                false> {
   using PolicyType = TeamPolicyInternal<PolicyArgs...>;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
+  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
+                                  PointerType result_ptr) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -591,68 +648,150 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const int league_size   = p.league_size();
     const int team_size     = p.team_size();
     const int vector_length = p.impl_vector_length();
-    const int nteams        = OpenMPTargetExec::MAX_ACTIVE_TEAMS < league_size
-                           ? OpenMPTargetExec::MAX_ACTIVE_TEAMS
-                           : league_size;
 
-    OpenMPTargetExec::resize_scratch(
-        0, PolicyType::member_type::TEAM_REDUCE_SIZE, 0, 0);
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
+                                     shmem_size_L0, shmem_size_L1);
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
 
     ValueType result = ValueType();
 
+    // Maximum active teams possible.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
+
+#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+: result)
+#pragma omp parallel reduction(+ : result)
+    {
+      const int blockIdx = omp_get_team_num();
+      const int gridDim  = omp_get_num_teams();
+
+      // Guarantee that the compilers respect the `num_teams` clause
+      if (gridDim <= nteams) {
+        for (int league_id = blockIdx; league_id < league_size;
+             league_id += gridDim) {
+          typename PolicyType::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              blockIdx, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_same<TagType, void>::value)
+            f(team, result);
+          else
+            f(TagType(), team, result);
+        }
+      } else
+        Kokkos::abort("`num_teams` clause was not respected.\n");
+    }
+
+    *result_ptr = result;
+#else
+// Saving the older implementation that uses `atomic_compare_exchange` to
+// calculate the shared memory block index and `distribute` clause to distribute
+// teams.
 #pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \
          map(to:f) map(tofrom:result) reduction(+: result) \
-    is_device_ptr(scratch_ptr)
+    is_device_ptr(scratch_ptr, lock_array)
     for (int i = 0; i < league_size; i++) {
       ValueType inner_result = ValueType();
+      int shmem_block_index = -1, lock_team = 99999, iter = -1;
+      iter = (omp_get_team_num() % max_active_teams);
+
+      // Loop as long as a shmem_block_index is not found.
+      while (shmem_block_index == -1) {
+        // Try and acquire a lock on the index.
+        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
+
+        // If lock is acquired assign it to the block index.
+        // lock_team = 0, implies atomic_compare_exchange is successfull.
+        if (lock_team == 0)
+          shmem_block_index = iter;
+        else
+          iter = ++iter % max_active_teams;
+      }
 #pragma omp parallel num_threads(team_size) reduction(+ : inner_result)
       {
-        typename PolicyType::member_type team(i, league_size, team_size,
-                                              vector_length, scratch_ptr, 0, 0);
+        typename PolicyType::member_type team(
+            i, league_size, team_size, vector_length, scratch_ptr,
+            shmem_block_index, shmem_size_L0, shmem_size_L1);
         f(team, inner_result);
       }
       result = inner_result;
+
+      // Free the locked block and increment the number of available free
+      // blocks.
+      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
     }
 
     *result_ptr = result;
+#endif
   }
 
+  inline static void execute(const FunctorType& f, const PolicyType& p,
+                             PointerType ptr) {
+    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+  }
+};
+
+template <class FunctorType, class ReducerType, class PointerType,
+          class ValueType, class... PolicyArgs>
+struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
+                                ReducerType, PointerType, ValueType, false,
+                                true> {
+  using PolicyType = TeamPolicyInternal<PolicyArgs...>;
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
+  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
+                                  PointerType result_ptr) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
 
-    const int league_size   = p.league_size();
-    const int team_size     = p.team_size();
-    const int vector_length = p.impl_vector_length();
-    const int nteams        = OpenMPTargetExec::MAX_ACTIVE_TEAMS < league_size
-                           ? OpenMPTargetExec::MAX_ACTIVE_TEAMS
-                           : league_size;
-
-    OpenMPTargetExec::resize_scratch(
-        0, PolicyType::member_type::TEAM_REDUCE_SIZE, 0, 0);
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+    const int league_size      = p.league_size();
+    const int team_size        = p.team_size();
+    const int vector_length    = p.impl_vector_length();
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
 
     ValueType result = ValueType();
 
-#pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \
-         map(to:f) map(tofrom:result) reduction(+: result) \
-    is_device_ptr(scratch_ptr)
-    for (int i = 0; i < league_size; i++) {
-      ValueType inner_result = ValueType();
-#pragma omp parallel num_threads(team_size) reduction(+ : inner_result)
-      {
-        typename PolicyType::member_type team(i, league_size, team_size,
-                                              vector_length, scratch_ptr, 0, 0);
-        f(TagType(), team, result);
-      }
-      result = inner_result;
+    // Maximum active teams possible.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
+
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(custom                             \
+                                         : result)
+#pragma omp parallel reduction(custom : result)
+    {
+      const int blockIdx = omp_get_team_num();
+      const int gridDim  = omp_get_num_teams();
+
+      // Guarantee that the compilers respect the `num_teams` clause
+      if (gridDim <= nteams) {
+        for (int league_id = blockIdx; league_id < league_size;
+             league_id += gridDim) {
+          typename PolicyType::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              blockIdx, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_same<TagType, void>::value)
+            f(team, result);
+          else
+            f(TagType(), team, result);
+        }
+      } else
+        Kokkos::abort("`num_teams` clause was not respected.\n");
     }
 
     *result_ptr = result;
@@ -680,8 +819,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using ValueTraits =
       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
@@ -790,61 +929,11 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
       : start(begin_), end(end_), team(thread_) {}
 };
 
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, iType count) {
-  return Impl::TeamThreadRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, iType begin,
-                iType end) {
-  return Impl::TeamThreadRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, begin, end);
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, iType count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, iType begin,
-                  iType end) {
-  return Impl::ThreadVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, begin, end);
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, iType count) {
-  return Impl::TeamVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, iType begin,
-                  iType end) {
-  return Impl::TeamVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, begin, end);
-}
-
 }  // namespace Impl
 
 }  // namespace Kokkos
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#undef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
 #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
index 2bd8a5d3a39ca54b178c465b29c639d89c691419..3dfad2bb856e0bb65a48dfd70b3458cee4c9beb5 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
@@ -156,7 +156,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #pragma omp target teams distribute parallel for collapse(2) map(to : functor)
     for (auto i0 = begin_0; i0 < end_0; i0++) {
       for (auto i1 = begin_1; i1 < end_1; i1++) {
-        functor(i0, i1);
+        if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+          functor(i0, i1);
+        else
+          functor(typename Policy::work_tag(), i0, i1);
       }
     }
 #else
@@ -170,7 +173,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
 #pragma omp for collapse(2)
     for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) functor(i0, i1);
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
+        if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+          functor(i0, i1);
+        else
+          functor(typename Policy::work_tag(), i0, i1);
+      }
 #endif
   }
 
@@ -192,7 +200,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     for (auto i0 = begin_0; i0 < end_0; i0++) {
       for (auto i1 = begin_1; i1 < end_1; i1++) {
         for (auto i2 = begin_2; i2 < end_2; i2++) {
-          functor(i0, i1, i2);
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, i2);
+          else
+            functor(typename Policy::work_tag(), i0, i1, i2);
         }
       }
     }
@@ -212,7 +223,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #pragma omp for collapse(3)
     for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
       for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) functor(i0, i1, i2);
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, i2);
+          else
+            functor(typename Policy::work_tag(), i0, i1, i2);
+        }
 #endif
   }
 
@@ -237,7 +253,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       for (auto i1 = begin_1; i1 < end_1; i1++) {
         for (auto i2 = begin_2; i2 < end_2; i2++) {
           for (auto i3 = begin_3; i3 < end_3; i3++) {
-            functor(i0, i1, i2, i3);
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, i3);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, i3);
           }
         }
       }
@@ -263,8 +282,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
       for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
         for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            functor(i0, i1, i2, i3);
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, i3);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, i3);
+          }
 #endif
   }
 
@@ -292,7 +315,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
         for (auto i2 = begin_2; i2 < end_2; i2++) {
           for (auto i3 = begin_3; i3 < end_3; i3++) {
             for (auto i4 = begin_4; i4 < end_4; i4++) {
-              functor(i0, i1, i2, i3, i4);
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, i4);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, i4);
             }
           }
         }
@@ -324,8 +351,13 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
         for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
           for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              functor(i0, i1, i2, i3, i4);
+            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, i4);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, i4);
+            }
 #endif
   }
 
@@ -356,7 +388,14 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           for (auto i3 = begin_3; i3 < end_3; i3++) {
             for (auto i4 = begin_4; i4 < end_4; i4++) {
               for (auto i5 = begin_5; i5 < end_5; i5++) {
-                functor(i0, i1, i2, i3, i4, i5);
+                {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                            i5);
+                }
               }
             }
           }
@@ -394,8 +433,13 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
         for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
           for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
             for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                functor(i0, i1, i2, i3, i4, i5);
+              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
+                if constexpr (std::is_same<typename Policy::work_tag,
+                                           void>::value)
+                  functor(i0, i1, i2, i3, i4, i5);
+                else
+                  functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5);
+              }
 #endif
   }
 
@@ -429,7 +473,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
             for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
               for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
                 for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  functor(i0, i1, i2, i3, i4, i5, i6);
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, i6);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            i6);
                 }
               }
             }
@@ -473,8 +522,14 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
             for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
               for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++)
-                  functor(i0, i1, i2, i3, i4, i5, i6);
+                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, i6);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            i6);
+                }
 #endif
   }
 
@@ -511,7 +566,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
               for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
                 for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
                   for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
-                    functor(i0, i1, i2, i3, i4, i5, i6, i7);
+                    if constexpr (std::is_same<typename Policy::work_tag,
+                                               void>::value)
+                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
+                    else
+                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                              i5, i6, i7);
                   }
                 }
               }
@@ -561,13 +621,26 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
             for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
               for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
                 for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++)
-                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++)
-                    functor(i0, i1, i2, i3, i4, i5, i6, i7);
+                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
+                    if constexpr (std::is_same<typename Policy::work_tag,
+                                               void>::value)
+                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
+                    else
+                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                              i5, i6, i7);
+                  }
 #endif
   }
 
   inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
+  // TODO DZP: based on a conversation with Christian, we're using 256 as a
+  // heuristic here. We need something better once we can query these kinds of
+  // properties
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    return 256;
+  }
 };
 
 }  // namespace Impl
@@ -758,6 +831,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     //  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a
     //  Kokkos::View in HostSpace" );
   }
+  // TODO DZP: based on a conversation with Christian, we're using 256 as a
+heuristic
+  // here. We need something better once we can query these kinds of properties
+  template<typename Policy, typename Functor>
+static int max_tile_size_product(const Policy&, const Functor&) {
+    return 256;
+  }
 };*/
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fa348611b953aa62704cb760521a275a04729985
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
@@ -0,0 +1,135 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP
+#define KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+
+#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <Kokkos_UniqueToken.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+// both global and instance Unique Tokens are implemented in the same way
+template <>
+class UniqueToken<OpenMPTarget, UniqueTokenScope::Global> {
+ protected:
+  uint32_t volatile* m_buffer;
+  uint32_t m_count;
+
+ public:
+  using execution_space = OpenMPTarget;
+  using size_type       = int32_t;
+
+  explicit UniqueToken(execution_space const& = execution_space());
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken(const UniqueToken&) = default;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken(UniqueToken&&) = default;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken& operator=(const UniqueToken&) = default;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken& operator=(UniqueToken&&) = default;
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  size_type size() const noexcept { return m_count; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  size_type acquire() const {
+    const Kokkos::pair<int, int> result =
+        Kokkos::Impl::concurrent_bitset::acquire_bounded(
+            m_buffer, m_count, Kokkos::Impl::clock_tic() % m_count);
+
+    if (result.first < 0) {
+      Kokkos::abort(
+          "UniqueToken<OpenMPTarget> failure to acquire tokens, no tokens "
+          "available");
+    }
+
+    return result.first;
+  }
+
+  /// \brief release an acquired value
+  KOKKOS_INLINE_FUNCTION
+  void release(size_type i) const noexcept {
+    Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
+  }
+};
+
+template <>
+class UniqueToken<OpenMPTarget, UniqueTokenScope::Instance>
+    : public UniqueToken<OpenMPTarget, UniqueTokenScope::Global> {
+ private:
+  Kokkos::View<uint32_t*, ::Kokkos::Experimental::OpenMPTargetSpace>
+      m_buffer_view;
+
+ public:
+  explicit UniqueToken(execution_space const& arg = execution_space())
+      : UniqueToken<OpenMPTarget, UniqueTokenScope::Global>(arg) {}
+
+  UniqueToken(size_type max_size, execution_space const& = execution_space())
+      : m_buffer_view(
+            "Kokkos::UniqueToken::m_buffer_view",
+            ::Kokkos::Impl::concurrent_bitset::buffer_bound(max_size)) {
+    m_buffer = m_buffer_view.data();
+    m_count  = max_size;
+  }
+};
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif  // KOKKOS_ENABLE_OPENMPTARGET
+#endif  // KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 567145161c7a1a9a251dd4e056c666863e6f15dc..9c29eb190d17b64c0340751a3459785c070d7c47 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -76,42 +76,75 @@ int get_gpu(const InitArguments& args);
 }  // namespace Impl
 
 namespace Experimental {
-SYCL::SYCL() : m_space_instance(&Impl::SYCLInternal::singleton()) {
+SYCL::SYCL()
+    : m_space_instance(&Impl::SYCLInternal::singleton(),
+                       [](Impl::SYCLInternal*) {}) {
   Impl::SYCLInternal::singleton().verify_is_initialized(
       "SYCL instance constructor");
 }
 
+SYCL::SYCL(const sycl::queue& stream)
+    : m_space_instance(new Impl::SYCLInternal, [](Impl::SYCLInternal* ptr) {
+        ptr->finalize();
+        delete ptr;
+      }) {
+  Impl::SYCLInternal::singleton().verify_is_initialized(
+      "SYCL instance constructor");
+  m_space_instance->initialize(stream);
+}
+
 int SYCL::concurrency() {
-  // FIXME_SYCL We need a value larger than 1 here for some tests to pass,
-  // clearly this is true but not the roght value
-  return 2;
+  return Impl::SYCLInternal::singleton().m_maxConcurrency;
 }
 
+const char* SYCL::name() { return "SYCL"; }
+
 bool SYCL::impl_is_initialized() {
   return Impl::SYCLInternal::singleton().is_initialized();
 }
 
 void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); }
 
-void SYCL::fence() const { m_space_instance->m_queue->wait(); }
+void SYCL::fence() const {
+  Impl::SYCLInternal::fence(*m_space_instance->m_queue);
+}
+
+void SYCL::impl_static_fence() {
+  // guard accessing all_queues
+  std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex);
+  for (auto& queue : Impl::SYCLInternal::all_queues)
+    Impl::SYCLInternal::fence(**queue);
+}
 
 int SYCL::sycl_device() const {
   return impl_internal_space_instance()->m_syclDev;
 }
 
-SYCL::SYCLDevice::SYCLDevice(cl::sycl::device d) : m_device(std::move(d)) {}
+SYCL::SYCLDevice::SYCLDevice(sycl::device d) : m_device(std::move(d)) {}
 
-SYCL::SYCLDevice::SYCLDevice(const cl::sycl::device_selector& selector)
+SYCL::SYCLDevice::SYCLDevice(const sycl::device_selector& selector)
     : m_device(selector.select_device()) {}
 
-cl::sycl::device SYCL::SYCLDevice::get_device() const { return m_device; }
+SYCL::SYCLDevice::SYCLDevice(size_t id) {
+  std::vector<sycl::device> gpu_devices =
+      sycl::device::get_devices(sycl::info::device_type::gpu);
+  if (id >= gpu_devices.size()) {
+    std::stringstream error_message;
+    error_message << "Requested GPU with id " << id << " but only "
+                  << gpu_devices.size() << " GPU(s) available!\n";
+    Kokkos::Impl::throw_runtime_exception(error_message.str());
+  }
+  m_device = gpu_devices[id];
+}
+
+sycl::device SYCL::SYCLDevice::get_device() const { return m_device; }
 
 void SYCL::impl_initialize(SYCL::SYCLDevice d) {
   Impl::SYCLInternal::singleton().initialize(d.get_device());
 }
 
 std::ostream& SYCL::SYCLDevice::info(std::ostream& os) const {
-  using namespace cl::sycl::info;
+  using namespace sycl::info;
   return os << "Name: " << m_device.get_info<device::name>()
             << "\nDriver Version: "
             << m_device.get_info<device::driver_version>()
@@ -227,7 +260,7 @@ std::ostream& SYCL::SYCLDevice::info(std::ostream& os) const {
 
 namespace Impl {
 
-int g_hip_space_factory_initialized =
+int g_sycl_space_factory_initialized =
     Kokkos::Impl::initialize_space_factory<SYCLSpaceInitializer>("170_SYCL");
 
 void SYCLSpaceInitializer::initialize(const InitArguments& args) {
@@ -236,9 +269,13 @@ void SYCLSpaceInitializer::initialize(const InitArguments& args) {
   if (std::is_same<Kokkos::Experimental::SYCL,
                    Kokkos::DefaultExecutionSpace>::value ||
       0 < use_gpu) {
-    // FIXME_SYCL choose a specific device
-    Kokkos::Experimental::SYCL::impl_initialize(
-        Kokkos::Experimental::SYCL::SYCLDevice(cl::sycl::default_selector()));
+    if (use_gpu > -1) {
+      Kokkos::Experimental::SYCL::impl_initialize(
+          Kokkos::Experimental::SYCL::SYCLDevice(use_gpu));
+    } else {
+      Kokkos::Experimental::SYCL::impl_initialize(
+          Kokkos::Experimental::SYCL::SYCLDevice(sycl::default_selector()));
+    }
   }
 }
 
@@ -252,9 +289,7 @@ void SYCLSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void SYCLSpaceInitializer::fence() {
-  // FIXME_SYCL should be
-  //  Kokkos::Experimental::SYCL::impl_static_fence();
-  Kokkos::Experimental::SYCL().fence();
+  Kokkos::Experimental::SYCL::impl_static_fence();
 }
 
 void SYCLSpaceInitializer::print_configuration(std::ostream& msg,
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_Category.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp
similarity index 86%
rename from packages/kokkos/core/unit_test/cuda/TestCudaUVM_Category.hpp
rename to packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp
index ff53e5a719a7274e8d38b93259286e14bc44d27a..13d6dc1a4a705421a05ce3f86e28f376de0ac41b 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_Category.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp
@@ -42,13 +42,21 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_TEST_CUDAUVM_HPP
-#define KOKKOS_TEST_CUDAUVM_HPP
+#ifndef KOKKOS_SYCL_ABORT_HPP
+#define KOKKOS_SYCL_ABORT_HPP
 
-#include <gtest/gtest.h>
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_SYCL)
 
-#define TEST_CATEGORY cuda_uvm
-#define TEST_CATEGORY_DEATH cuda_uvm_DeathTest
-#define TEST_EXECSPACE Kokkos::CudaUVMSpace
+namespace Kokkos {
+namespace Impl {
 
+inline void sycl_abort(char const *msg) {
+  KOKKOS_IMPL_DO_NOT_USE_PRINTF("Aborting with message %s.\n", msg);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
 #endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
index 95906f8e7e91830e72a7e752bb761cc236f0e06f..aef65ee7ecbbf3c39432b42a42b595dbfe00b239 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
@@ -131,6 +131,100 @@ struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
   }
 };
 
+template <>
+struct DeepCopy<Experimental::SYCLSharedUSMSpace,
+                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                      Experimental::SYCLDeviceUSMSpace,
+                      Kokkos::Experimental::SYCL> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                 Experimental::SYCLDeviceUSMSpace,
+                 Kokkos::Experimental::SYCL>::DeepCopy;
+};
+
+template <>
+struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace,
+                Kokkos::Experimental::SYCL>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
+                      Kokkos::Experimental::SYCL> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
+                 Kokkos::Experimental::SYCL>::DeepCopy;
+};
+
+template <>
+struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace,
+                Kokkos::Experimental::SYCL>
+    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
+                      Kokkos::Experimental::SYCL> {
+  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
+                 Kokkos::Experimental::SYCL>::DeepCopy;
+};
+
+template <>
+struct DeepCopy<Experimental::SYCLSharedUSMSpace,
+                Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                      Experimental::SYCLDeviceUSMSpace,
+                      Kokkos::Experimental::SYCL> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                 Experimental::SYCLDeviceUSMSpace,
+                 Kokkos::Experimental::SYCL>::DeepCopy;
+};
+
+template <>
+struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                      Experimental::SYCLDeviceUSMSpace,
+                      Kokkos::Experimental::SYCL> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                 Experimental::SYCLDeviceUSMSpace,
+                 Kokkos::Experimental::SYCL>::DeepCopy;
+};
+
+template <class ExecutionSpace>
+struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
+};
+
+template <class ExecutionSpace>
+struct DeepCopy<Experimental::SYCLSharedUSMSpace,
+                Experimental::SYCLDeviceUSMSpace, ExecutionSpace>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
+};
+
+template <class ExecutionSpace>
+struct DeepCopy<Experimental::SYCLSharedUSMSpace,
+                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
+                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
+};
+
+template <class ExecutionSpace>
+struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace, ExecutionSpace>
+    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
+                      ExecutionSpace> {
+  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
+                 ExecutionSpace>::DeepCopy;
+};
+
+template <class ExecutionSpace>
+struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace, ExecutionSpace>
+    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
+                      ExecutionSpace> {
+  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
+                 ExecutionSpace>::DeepCopy;
+};
+
 }  // namespace Impl
 }  // namespace Kokkos
 #endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
index ef4f466b8a4ddc34464ab2c4c13db28e92391fb8..5a702b5027277cc7137cba9bba72e7367e9ae97b 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
@@ -44,27 +44,28 @@
 
 #include <Kokkos_Concepts.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
+#include <KokkosCore_Config_DeclareBackend.hpp>
 #include <Kokkos_SYCL.hpp>
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_Serial.hpp>
+#include <impl/Kokkos_ConcurrentBitset.hpp>
 #include <impl/Kokkos_Error.hpp>
 
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-int SYCLInternal::was_finalized = 0;
+std::vector<std::optional<sycl::queue>*> SYCLInternal::all_queues;
+std::mutex SYCLInternal::mutex;
 
 SYCLInternal::~SYCLInternal() {
-  if (m_scratchSpace || m_scratchFlags) {
+  if (!was_finalized || m_scratchSpace || m_scratchFlags ||
+      m_scratchConcurrentBitset) {
     std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call "
                  "Kokkos::Experimental::SYCL::finalize()"
               << std::endl;
     std::cerr.flush();
   }
-
-  m_scratchSpace = nullptr;
-  m_scratchFlags = nullptr;
 }
 
 int SYCLInternal::verify_is_initialized(const char* const label) const {
@@ -79,8 +80,26 @@ SYCLInternal& SYCLInternal::singleton() {
   return self;
 }
 
-// FIME_SYCL
-void SYCLInternal::initialize(const cl::sycl::device& d) {
+void SYCLInternal::initialize(const sycl::device& d) {
+  auto exception_handler = [](sycl::exception_list exceptions) {
+    bool asynchronous_error = false;
+    for (std::exception_ptr const& e : exceptions) {
+      try {
+        std::rethrow_exception(e);
+      } catch (sycl::exception const& e) {
+        std::cerr << e.what() << '\n';
+        asynchronous_error = true;
+      }
+    }
+    if (asynchronous_error)
+      Kokkos::Impl::throw_runtime_exception(
+          "There was an asynchronous SYCL error!\n");
+  };
+  initialize(sycl::queue{d, exception_handler});
+}
+
+// FIXME_SYCL
+void SYCLInternal::initialize(const sycl::queue& q) {
   if (was_finalized)
     Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n");
 
@@ -96,9 +115,44 @@ void SYCLInternal::initialize(const cl::sycl::device& d) {
   const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;
   const bool ok_dev  = true;
   if (ok_init && ok_dev) {
-    m_queue = std::make_unique<cl::sycl::queue>(d);
+    m_queue = q;
+    // guard pushing to all_queues
+    {
+      std::lock_guard<std::mutex> lock(mutex);
+      all_queues.push_back(&m_queue);
+    }
+    const sycl::device& d = m_queue->get_device();
     std::cout << SYCL::SYCLDevice(d) << '\n';
-    m_indirectKernel.emplace(IndirectKernelAllocator(*m_queue));
+
+    m_maxWorkgroupSize =
+        d.template get_info<sycl::info::device::max_work_group_size>();
+    // FIXME_SYCL this should give the correct value for NVIDIA GPUs
+    m_maxConcurrency =
+        m_maxWorkgroupSize * 2 *
+        d.template get_info<sycl::info::device::max_compute_units>();
+
+    // Setup concurent bitset for obtaining unique tokens from within an
+    // executing kernel.
+    {
+      const int32_t buffer_bound =
+          Kokkos::Impl::concurrent_bitset::buffer_bound(m_maxConcurrency);
+      using Record = Kokkos::Impl::SharedAllocationRecord<
+          Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
+      Record* const r =
+          Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
+                           "Kokkos::SYCL::InternalScratchBitset",
+                           sizeof(uint32_t) * buffer_bound);
+      Record::increment(r);
+      m_scratchConcurrentBitset = reinterpret_cast<uint32_t*>(r->data());
+      auto event                = m_queue->memset(m_scratchConcurrentBitset, 0,
+                                   sizeof(uint32_t) * buffer_bound);
+      fence(event);
+    }
+
+    m_maxShmemPerBlock =
+        d.template get_info<sycl::info::device::local_mem_size>();
+    m_indirectKernelMem.reset(*m_queue);
+    m_indirectReducerMem.reset(*m_queue);
   } else {
     std::ostringstream msg;
     msg << "Kokkos::Experimental::SYCL::initialize(...) FAILED";
@@ -112,16 +166,126 @@ void SYCLInternal::initialize(const cl::sycl::device& d) {
 
 void SYCLInternal::finalize() {
   SYCL().fence();
-  was_finalized = 1;
-  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
-    // FIXME_SYCL
-    std::abort();
-  }
+  was_finalized = true;
+
+  using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
+  if (nullptr != m_scratchSpace)
+    RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace));
+  if (nullptr != m_scratchFlags)
+    RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags));
+  m_syclDev           = -1;
+  m_scratchSpaceCount = 0;
+  m_scratchSpace      = nullptr;
+  m_scratchFlagsCount = 0;
+  m_scratchFlags      = nullptr;
 
-  m_indirectKernel.reset();
+  RecordSYCL::decrement(RecordSYCL::get_record(m_scratchConcurrentBitset));
+  m_scratchConcurrentBitset = nullptr;
+
+  m_indirectKernelMem.reset();
+  m_indirectReducerMem.reset();
+  // guard erasing from all_queues
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    all_queues.erase(std::find(all_queues.begin(), all_queues.end(), &m_queue));
+  }
   m_queue.reset();
 }
 
+void* SYCLInternal::scratch_space(
+    const Kokkos::Experimental::SYCL::size_type size) {
+  const size_type sizeScratchGrain =
+      sizeof(Kokkos::Experimental::SYCL::size_type);
+  if (verify_is_initialized("scratch_space") &&
+      m_scratchSpaceCount * sizeScratchGrain < size) {
+    m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+
+    using Record = Kokkos::Impl::SharedAllocationRecord<
+        Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
+
+    if (nullptr != m_scratchSpace)
+      Record::decrement(Record::get_record(m_scratchSpace));
+
+    Record* const r =
+        Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
+                         "Kokkos::SYCL::InternalScratchSpace",
+                         (sizeScratchGrain * m_scratchSpaceCount));
+
+    Record::increment(r);
+
+    m_scratchSpace = reinterpret_cast<size_type*>(r->data());
+  }
+
+  return m_scratchSpace;
+}
+
+void* SYCLInternal::scratch_flags(
+    const Kokkos::Experimental::SYCL::size_type size) {
+  const size_type sizeScratchGrain =
+      sizeof(Kokkos::Experimental::SYCL::size_type);
+  if (verify_is_initialized("scratch_flags") &&
+      m_scratchFlagsCount * sizeScratchGrain < size) {
+    m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;
+
+    using Record = Kokkos::Impl::SharedAllocationRecord<
+        Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
+
+    if (nullptr != m_scratchFlags)
+      Record::decrement(Record::get_record(m_scratchFlags));
+
+    Record* const r =
+        Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
+                         "Kokkos::SYCL::InternalScratchFlags",
+                         (sizeScratchGrain * m_scratchFlagsCount));
+
+    Record::increment(r);
+
+    m_scratchFlags = reinterpret_cast<size_type*>(r->data());
+  }
+  m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain);
+  fence(*m_queue);
+
+  return m_scratchFlags;
+}
+
+template <sycl::usm::alloc Kind>
+size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
+  assert(m_size == 0);
+  assert(m_q);
+
+  if (m_capacity < n) {
+    using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>;
+    // First free what we have (in case malloc can reuse it)
+    if (m_data) Record::decrement(Record::get_record(m_data));
+
+    Record* const r = Record::allocate(AllocationSpace(*m_q),
+                                       "Kokkos::SYCL::USMObjectMem", n);
+    Record::increment(r);
+
+    m_data     = r->data();
+    m_capacity = n;
+  }
+
+  return m_capacity;
+}
+
+template <sycl::usm::alloc Kind>
+void SYCLInternal::USMObjectMem<Kind>::reset() {
+  assert(m_size == 0);
+
+  if (m_data) {
+    using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>;
+    Record::decrement(Record::get_record(m_data));
+
+    m_capacity = 0;
+    m_data     = nullptr;
+  }
+  m_q.reset();
+}
+
+template class SYCLInternal::USMObjectMem<sycl::usm::alloc::shared>;
+template class SYCLInternal::USMObjectMem<sycl::usm::alloc::device>;
+
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
index 254d37467966c31896d21b7bd7c4d2d2d4f1b53b..e797411cd40bdd734c04d2a9b0e51151fa269ebd 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
@@ -45,9 +45,11 @@
 #ifndef KOKKOS_SYCL_INSTANCE_HPP_
 #define KOKKOS_SYCL_INSTANCE_HPP_
 
-#include <memory>
+#include <optional>
 #include <CL/sycl.hpp>
 
+#include <impl/Kokkos_Error.hpp>
+
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
@@ -64,38 +66,273 @@ class SYCLInternal {
   SYCLInternal& operator=(SYCLInternal&&) = delete;
   SYCLInternal(SYCLInternal&&)            = delete;
 
-  int m_syclDev             = -1;
-  size_type* m_scratchSpace = nullptr;
-  size_type* m_scratchFlags = nullptr;
+  void* scratch_space(const size_type size);
+  void* scratch_flags(const size_type size);
+
+  int m_syclDev = -1;
+
+  size_t m_maxWorkgroupSize   = 0;
+  uint32_t m_maxConcurrency   = 0;
+  uint64_t m_maxShmemPerBlock = 0;
+
+  uint32_t* m_scratchConcurrentBitset = nullptr;
+  size_type m_scratchSpaceCount       = 0;
+  size_type* m_scratchSpace           = nullptr;
+  size_type m_scratchFlagsCount       = 0;
+  size_type* m_scratchFlags           = nullptr;
+
+  std::optional<sycl::queue> m_queue;
+
+  // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when
+  // compiling for the CUDA backend. Storing pointers instead works around this.
+  static std::vector<std::optional<sycl::queue>*> all_queues;
+  // We need a mutex for thread safety when modifying all_queues.
+  static std::mutex mutex;
+
+  // USMObjectMem is a reusable buffer for a single object
+  // in USM memory
+  template <sycl::usm::alloc Kind>
+  class USMObjectMem {
+   public:
+    class Deleter {
+     public:
+      Deleter() = default;
+      explicit Deleter(USMObjectMem* mem) : m_mem(mem) {}
+
+      template <typename T>
+      void operator()(T* p) const noexcept {
+        assert(m_mem);
+        assert(sizeof(T) == m_mem->size());
+
+        if constexpr (sycl::usm::alloc::device == kind)
+          // Only skipping the dtor on trivially copyable types
+          static_assert(std::is_trivially_copyable_v<T>);
+        else
+          p->~T();
+
+        m_mem->m_size = 0;
+      }
+
+     private:
+      USMObjectMem* m_mem = nullptr;
+    };
+
+    static constexpr sycl::usm::alloc kind = Kind;
+
+    void reset();
+
+    void reset(sycl::queue q) {
+      reset();
+      m_q.emplace(std::move(q));
+    }
+
+    USMObjectMem() = default;
+    explicit USMObjectMem(sycl::queue q) noexcept : m_q(std::move(q)) {}
+
+    USMObjectMem(USMObjectMem const&) = delete;
+    USMObjectMem(USMObjectMem&&)      = delete;
+    USMObjectMem& operator=(USMObjectMem&&) = delete;
+    USMObjectMem& operator=(USMObjectMem const&) = delete;
+
+    ~USMObjectMem() { reset(); };
+
+    void* data() noexcept { return m_data; }
+    const void* data() const noexcept { return m_data; }
+
+    size_t size() const noexcept { return m_size; }
+    size_t capacity() const noexcept { return m_capacity; }
+
+    // reserve() allocates space for at least n bytes
+    // returns the new capacity
+    size_t reserve(size_t n);
+
+   private:
+    using AllocationSpace =
+        std::conditional_t<Kind == sycl::usm::alloc::device,
+                           Kokkos::Experimental::SYCLDeviceUSMSpace,
+                           Kokkos::Experimental::SYCLSharedUSMSpace>;
+
+    // This will memcpy an object T into memory held by this object
+    // returns: a T* to that object
+    //
+    // Note:  it is UB to dereference this pointer with an object that is
+    // not an implicit-lifetime nor trivially-copyable type, but presumably much
+    // faster because we can use USM device memory
+    template <typename T>
+    std::unique_ptr<T, Deleter> memcpy_from(const T& t) {
+      reserve(sizeof(T));
+      sycl::event memcopied = m_q->memcpy(m_data, std::addressof(t), sizeof(T));
+      fence(memcopied);
+
+      std::unique_ptr<T, Deleter> ptr(reinterpret_cast<T*>(m_data),
+                                      Deleter(this));
+      m_size = sizeof(T);
+      return ptr;
+    }
+
+    // This will copy-constuct an object T into memory held by this object
+    // returns: a unique_ptr<T, destruct_delete> that will call the
+    // destructor on the type when it goes out of scope.
+    //
+    // Note:  This will not work with USM device memory
+    template <typename T>
+    std::unique_ptr<T, Deleter> copy_construct_from(const T& t) {
+      static_assert(kind != sycl::usm::alloc::device,
+                    "Cannot copy construct into USM device memory");
+
+      reserve(sizeof(T));
+
+      std::unique_ptr<T, Deleter> ptr(new (m_data) T(t), Deleter(this));
+      m_size = sizeof(T);
+      return ptr;
+    }
+
+   public:
+    // Performs either memcpy (for USM device memory) and returns a T*
+    // (but is technically UB when dereferenced on an object that is not
+    // an implicit-lifetime nor trivially-copyable type
+    //
+    // or
+    //
+    // performs copy construction (for other USM memory types) and returns a
+    // unique_ptr<T, ...>
+    template <typename T>
+    std::unique_ptr<T, Deleter> copy_from(const T& t) {
+      if constexpr (sycl::usm::alloc::device == kind)
+        return memcpy_from(t);
+      else
+        return copy_construct_from(t);
+    }
 
-  std::unique_ptr<cl::sycl::queue> m_queue;
+   private:
+    // Returns a reference to t (helpful when debugging)
+    template <typename T>
+    T& memcpy_to(T& t) {
+      assert(sizeof(T) == m_size);
+
+      sycl::event memcopied = m_q->memcpy(std::addressof(t), m_data, sizeof(T));
+      fence(memcopied);
+
+      return t;
+    }
+
+    // Returns a reference to t (helpful when debugging)
+    template <typename T>
+    T& move_assign_to(T& t) {
+      static_assert(kind != sycl::usm::alloc::device,
+                    "Cannot move_assign_to from USM device memory");
+
+      assert(sizeof(T) == m_size);
+
+      t = std::move(*static_cast<T*>(m_data));
+
+      return t;
+    }
+
+   public:
+    // Returns a reference to t (helpful when debugging)
+    template <typename T>
+    T& transfer_to(T& t) {
+      if constexpr (sycl::usm::alloc::device == kind)
+        return memcpy_to(t);
+      else
+        return move_assign_to(t);
+    }
+
+   private:
+    // USMObjectMem class invariants
+    // All four expressions below must evaluate to true:
+    //
+    //  !m_data == !m_capacity
+    //  m_q || !m_data
+    //  m_data || !m_size
+    //  m_size <= m_capacity
+    //
+    //  The above invariants mean that:
+    //  if m_size != 0 then m_data != 0
+    //  if m_data != 0 then m_capacity != 0 && m_q != nullopt
+    //  if m_data == 0 then m_capacity == 0
+
+    std::optional<sycl::queue> m_q;
+    void* m_data      = nullptr;
+    size_t m_size     = 0;  // sizeof(T) iff m_data points to live T
+    size_t m_capacity = 0;
+  };
 
   // An indirect kernel is one where the functor to be executed is explicitly
-  // created in USM shared memory before being executed, to get around the
+  // copied to USM device memory before being executed, to get around the
   // trivially copyable limitation of SYCL.
-  //
-  // m_indirectKernel just manages the memory as a reuseable buffer.  It is
-  // stored in an optional because the allocator contains a queue
-  using IndirectKernelAllocator =
-      sycl::usm_allocator<std::byte, sycl::usm::alloc::shared>;
-  using IndirectKernelMemory =
-      std::vector<IndirectKernelAllocator::value_type, IndirectKernelAllocator>;
-  using IndirectKernel = std::optional<IndirectKernelMemory>;
-  IndirectKernel m_indirectKernel;
-
-  static int was_finalized;
+  using IndirectKernelMem = USMObjectMem<sycl::usm::alloc::shared>;
+  IndirectKernelMem m_indirectKernelMem;
+
+  using IndirectReducerMem = USMObjectMem<sycl::usm::alloc::shared>;
+  IndirectReducerMem m_indirectReducerMem;
+
+  bool was_finalized = false;
 
   static SYCLInternal& singleton();
 
   int verify_is_initialized(const char* const label) const;
 
-  void initialize(const cl::sycl::device& d);
+  void initialize(const sycl::device& d);
+
+  void initialize(const sycl::queue& q);
 
-  int is_initialized() const { return m_queue != nullptr; }
+  int is_initialized() const { return m_queue.has_value(); }
 
   void finalize();
+
+ private:
+  // fence(...) takes any type with a .wait_and_throw() method
+  // (sycl::event and sycl::queue)
+  template <typename WAT>
+  static void fence_helper(WAT& wat) {
+    try {
+      wat.wait_and_throw();
+    } catch (sycl::exception const& e) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("There was a synchronous SYCL error:\n") += e.what());
+    }
+  }
+
+ public:
+  static void fence(sycl::queue& q) { fence_helper(q); }
+  static void fence(sycl::event& e) { fence_helper(e); }
+};
+
+template <typename Functor, typename Storage,
+          bool is_memcpyable = std::is_trivially_copyable_v<Functor>>
+class SYCLFunctionWrapper;
+
+template <typename Functor, typename Storage>
+class SYCLFunctionWrapper<Functor, Storage, true> {
+  const Functor& m_functor;
+
+ public:
+  SYCLFunctionWrapper(const Functor& functor, Storage&) : m_functor(functor) {}
+
+  const Functor& get_functor() const { return m_functor; }
+};
+
+template <typename Functor, typename Storage>
+class SYCLFunctionWrapper<Functor, Storage, false> {
+  std::unique_ptr<Functor,
+                  Experimental::Impl::SYCLInternal::IndirectKernelMem::Deleter>
+      m_kernelFunctorPtr;
+
+ public:
+  SYCLFunctionWrapper(const Functor& functor, Storage& storage)
+      : m_kernelFunctorPtr(storage.copy_from(functor)) {}
+
+  std::reference_wrapper<const Functor> get_functor() const {
+    return {*m_kernelFunctorPtr};
+  }
 };
 
+template <typename Functor, typename Storage>
+auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) {
+  return SYCLFunctionWrapper<Functor, Storage>(functor, storage);
+}
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e90ec1fb50b21e92f4f2ce589f98e2e755967ea
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp
@@ -0,0 +1,37 @@
+#ifndef KOKKOS_SYCL_MDRANGEPOLICY_HPP_
+#define KOKKOS_SYCL_MDRANGEPOLICY_HPP_
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+
+template <>
+struct default_outer_direction<Kokkos::Experimental::SYCL> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+template <>
+struct default_inner_direction<Kokkos::Experimental::SYCL> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+namespace Impl {
+
+// Settings for MDRangePolicy
+template <>
+inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::SYCL>(
+    const Kokkos::Experimental::SYCL& space) {
+  TileSizeProperties properties;
+  properties.max_threads =
+      space.impl_internal_space_instance()->m_maxWorkgroupSize;
+  properties.default_largest_tile_size = 16;
+  properties.default_tile_size         = 2;
+  properties.max_total_tile_size       = properties.max_threads;
+  return properties;
+}
+
+}  // Namespace Impl
+}  // Namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
index ba42c36d39a277d47b79bb3dab3775b6f9cd156a..a286169c45988339dce1b14c6d6a4ffde25dcea5 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
@@ -45,6 +45,8 @@
 #ifndef KOKKOS_SYCL_PARALLEL_RANGE_HPP_
 #define KOKKOS_SYCL_PARALLEL_RANGE_HPP_
 
+#include <impl/KokkosExp_IterateTileGPU.hpp>
+
 template <class FunctorType, class ExecPolicy>
 class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
                                 Kokkos::Experimental::SYCL> {
@@ -59,27 +61,22 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
   const FunctorType m_functor;
   const Policy m_policy;
 
- private:
-  ParallelFor()        = delete;
-  ParallelFor& operator=(const ParallelFor&) = delete;
-
   template <typename Functor>
   static void sycl_direct_launch(const Policy& policy, const Functor& functor) {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
-    cl::sycl::queue& q = *instance.m_queue;
+    sycl::queue& q = *instance.m_queue;
 
-    q.wait();
+    space.fence();
 
-    q.submit([functor, policy](cl::sycl::handler& cgh) {
-      cl::sycl::range<1> range(policy.end() - policy.begin());
+    q.submit([functor, policy](sycl::handler& cgh) {
+      sycl::range<1> range(policy.end() - policy.begin());
+      const auto begin = policy.begin();
 
-      cgh.parallel_for(range, [=](cl::sycl::item<1> item) {
-        const typename Policy::index_type id =
-            static_cast<typename Policy::index_type>(item.get_linear_id()) +
-            policy.begin();
+      cgh.parallel_for(range, [=](sycl::item<1> item) {
+        const typename Policy::index_type id = item.get_linear_id() + begin;
         if constexpr (std::is_same<WorkTag, void>::value)
           functor(id);
         else
@@ -87,47 +84,188 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
       });
     });
 
-    q.wait();
+    space.fence();
+  }
+
+ public:
+  using functor_type = FunctorType;
+
+  void execute() const {
+    if (m_policy.begin() == m_policy.end()) return;
+
+    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
+        indirectKernelMem = m_policy.space()
+                                .impl_internal_space_instance()
+                                ->m_indirectKernelMem;
+
+    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+  }
+
+  ParallelFor(const ParallelFor&) = delete;
+  ParallelFor(ParallelFor&&)      = delete;
+  ParallelFor& operator=(const ParallelFor&) = delete;
+  ParallelFor& operator=(ParallelFor&&) = delete;
+  ~ParallelFor()                        = default;
+
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+// ParallelFor
+template <class FunctorType, class... Traits>
+class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                                Kokkos::Experimental::SYCL> {
+ public:
+  using Policy = Kokkos::MDRangePolicy<Traits...>;
+
+ private:
+  using array_index_type = typename Policy::array_index_type;
+  using index_type       = typename Policy::index_type;
+  using LaunchBounds     = typename Policy::launch_bounds;
+  using WorkTag          = typename Policy::work_tag;
+
+  const FunctorType m_functor;
+  // MDRangePolicy is not trivially copyable. Hence, replicate the data we
+  // really need in DeviceIterateTile in a trivially copyable struct.
+  const struct BarePolicy {
+    using index_type = typename Policy::index_type;
+
+    BarePolicy(const Policy& policy)
+        : m_lower(policy.m_lower),
+          m_upper(policy.m_upper),
+          m_tile(policy.m_tile),
+          m_tile_end(policy.m_tile_end),
+          m_num_tiles(policy.m_num_tiles) {}
+
+    const typename Policy::point_type m_lower;
+    const typename Policy::point_type m_upper;
+    const typename Policy::tile_type m_tile;
+    const typename Policy::point_type m_tile_end;
+    const typename Policy::index_type m_num_tiles;
+    static constexpr Iterate inner_direction = Policy::inner_direction;
+  } m_policy;
+  const Kokkos::Experimental::SYCL& m_space;
+
+  sycl::nd_range<3> compute_ranges() const {
+    const auto& m_tile     = m_policy.m_tile;
+    const auto& m_tile_end = m_policy.m_tile_end;
+
+    if constexpr (Policy::rank == 2) {
+      sycl::range<3> local_sizes(m_tile[0], m_tile[1], 1);
+      sycl::range<3> global_sizes(m_tile_end[0] * m_tile[0],
+                                  m_tile_end[1] * m_tile[1], 1);
+      return {global_sizes, local_sizes};
+    }
+    if constexpr (Policy::rank == 3) {
+      sycl::range<3> local_sizes(m_tile[0], m_tile[1], m_tile[2]);
+      sycl::range<3> global_sizes(m_tile_end[0] * m_tile[0],
+                                  m_tile_end[1] * m_tile[1],
+                                  m_tile_end[2] * m_tile[2]);
+      return {global_sizes, local_sizes};
+    }
+    if constexpr (Policy::rank == 4) {
+      // id0,id1 encoded within first index; id2 to second index; id3 to third
+      // index
+      sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2], m_tile[3]);
+      sycl::range<3> global_sizes(
+          m_tile_end[0] * m_tile[0] * m_tile_end[1] * m_tile[1],
+          m_tile_end[2] * m_tile[2], m_tile_end[3] * m_tile[3]);
+      return {global_sizes, local_sizes};
+    }
+    if constexpr (Policy::rank == 5) {
+      // id0,id1 encoded within first index; id2,id3 to second index; id4 to
+      // third index
+      sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2] * m_tile[3],
+                                 m_tile[4]);
+      sycl::range<3> global_sizes(
+          m_tile_end[0] * m_tile[0] * m_tile_end[1] * m_tile[1],
+          m_tile_end[2] * m_tile[2] * m_tile_end[3] * m_tile[3],
+          m_tile_end[4] * m_tile[4]);
+      return {global_sizes, local_sizes};
+    }
+    if constexpr (Policy::rank == 6) {
+      // id0,id1 encoded within first index; id2,id3 to second index; id4,id5 to
+      // third index
+      sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2] * m_tile[3],
+                                 m_tile[4] * m_tile[5]);
+      sycl::range<3> global_sizes(
+          m_tile_end[0] * m_tile[0] * m_tile_end[1] * m_tile[1],
+          m_tile_end[2] * m_tile[2] * m_tile_end[3] * m_tile[3],
+          m_tile_end[4] * m_tile[4] * m_tile_end[5] * m_tile[5]);
+      return {global_sizes, local_sizes};
+    }
+    static_assert(Policy::rank > 1 && Policy::rank < 7,
+                  "Kokkos::MDRange Error: Exceeded rank bounds with SYCL\n");
   }
 
-  // Indirectly launch a functor by explicitly creating it in USM shared memory
-  void sycl_indirect_launch() const {
+  template <typename Functor>
+  void sycl_direct_launch(const Functor& functor) const {
     // Convenience references
-    const Kokkos::Experimental::SYCL& space = m_policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMemory& kernelMem =
-        *instance.m_indirectKernel;
+        *m_space.impl_internal_space_instance();
+    sycl::queue& q = *instance.m_queue;
+
+    m_space.fence();
 
-    // Allocate USM shared memory for the functor
-    kernelMem.resize(std::max(kernelMem.size(), sizeof(m_functor)));
+    if (m_policy.m_num_tiles == 0) return;
 
-    // Placement new a copy of functor into USM shared memory
-    //
-    // Store it in a unique_ptr to call its destructor on scope exit
-    std::unique_ptr<FunctorType, Kokkos::Impl::destruct_delete>
-        kernelFunctorPtr(new (kernelMem.data()) FunctorType(m_functor));
+    const BarePolicy bare_policy(m_policy);
 
-    // Use reference_wrapper (because it is both trivially copyable and
-    // invocable) and launch it
-    sycl_direct_launch(m_policy, std::reference_wrapper(*kernelFunctorPtr));
+    q.submit([functor, this, bare_policy](sycl::handler& cgh) {
+      const auto range = compute_ranges();
+
+      cgh.parallel_for(range, [functor, bare_policy](sycl::nd_item<3> item) {
+        const index_type local_x    = item.get_local_id(0);
+        const index_type local_y    = item.get_local_id(1);
+        const index_type local_z    = item.get_local_id(2);
+        const index_type global_x   = item.get_group(0);
+        const index_type global_y   = item.get_group(1);
+        const index_type global_z   = item.get_group(2);
+        const index_type n_global_x = item.get_group_range(0);
+        const index_type n_global_y = item.get_group_range(1);
+        const index_type n_global_z = item.get_group_range(2);
+
+        Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor,
+                                        typename Policy::work_tag>(
+            bare_policy, functor, {n_global_x, n_global_y, n_global_z},
+            {global_x, global_y, global_z}, {local_x, local_y, local_z})
+            .exec_range();
+      });
+    });
+
+    m_space.fence();
   }
 
  public:
   using functor_type = FunctorType;
 
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& policy, const Functor&) {
+    return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize;
+  }
+
   void execute() const {
-    // if the functor is trivially copyable, we can launch it directly;
-    // otherwise, we will launch it indirectly via explicitly creating
-    // it in USM shared memory.
-    if constexpr (std::is_trivially_copyable_v<decltype(m_functor)>)
-      sycl_direct_launch(m_policy, m_functor);
-    else
-      sycl_indirect_launch();
+    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
+        indirectKernelMem =
+            m_space.impl_internal_space_instance()->m_indirectKernelMem;
+
+    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+    sycl_direct_launch(functor_wrapper.get_functor());
   }
 
+  ParallelFor(const ParallelFor&) = delete;
+  ParallelFor(ParallelFor&&)      = delete;
+  ParallelFor& operator=(const ParallelFor&) = delete;
+  ParallelFor& operator=(ParallelFor&&) = delete;
+  ~ParallelFor()                        = default;
+
   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_space(arg_policy.space()) {}
 };
 
 #endif  // KOKKOS_SYCL_PARALLEL_RANGE_HPP_
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
index ac5e8154f30ad1cb093e652e704454dee3b751b1..03b7753f8e81ef5045b16cedd4206d85174c0033 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
@@ -69,14 +69,6 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using reference_type  = typename Analysis::reference_type;
 
   using WorkTag = typename Policy::work_tag;
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-  using ValueInit =
-      typename Kokkos::Impl::FunctorValueInit<FunctorType, WorkTagFwd>;
 
  public:
   // V - View
@@ -94,198 +86,503 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         m_result_ptr(reducer.view().data()) {}
 
  private:
-  template <typename TagType>
-  std::enable_if_t<std::is_void<TagType>::value> exec(reference_type update) {
-    using member_type = typename Policy::member_type;
-    member_type e     = m_policy.end();
-    for (member_type i = m_policy.begin(); i < e; ++i) m_functor(i, update);
+  template <typename PolicyType, typename Functor, typename Reducer>
+  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
+                          const Reducer& reducer) const {
+    using ReducerConditional =
+        Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                           FunctorType, ReducerType>;
+    using ReducerTypeFwd = typename ReducerConditional::type;
+    using WorkTagFwd =
+        std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                           WorkTag, void>;
+    using ValueInit =
+        Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+    using ValueJoin =
+        Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+    using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
+
+    auto selected_reducer = ReducerConditional::select(functor, reducer);
+
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = *instance.m_queue;
+
+    // FIXME_SYCL optimize
+    constexpr size_t wgroup_size       = 128;
+    constexpr size_t values_per_thread = 2;
+    std::size_t size                   = policy.end() - policy.begin();
+    const auto init_size               = std::max<std::size_t>(
+        ((size + values_per_thread - 1) / values_per_thread + wgroup_size - 1) /
+            wgroup_size,
+        1);
+    const unsigned int value_count =
+        FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
+            selected_reducer);
+    // FIXME_SYCL only use the first half
+    const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
+        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
+    // FIXME_SYCL without this we are running into a race condition
+    const auto results_ptr2 =
+        results_ptr + std::max(value_count, 1u) * init_size;
+
+    // If size<=1 we only call init(), the functor and possibly final once
+    // working with the global scratch memory but don't copy back to
+    // m_result_ptr yet.
+    if (size <= 1) {
+      q.submit([&](sycl::handler& cgh) {
+        const auto begin = policy.begin();
+        cgh.single_task([=]() {
+          const auto& selected_reducer = ReducerConditional::select(
+              static_cast<const FunctorType&>(functor),
+              static_cast<const ReducerType&>(reducer));
+          reference_type update =
+              ValueInit::init(selected_reducer, results_ptr);
+          if (size == 1) {
+            if constexpr (std::is_same<WorkTag, void>::value)
+              functor(begin, update);
+            else
+              functor(WorkTag(), begin, update);
+          }
+          if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+            FunctorFinal<FunctorType, WorkTag>::final(
+                static_cast<const FunctorType&>(functor), results_ptr);
+        });
+      });
+      space.fence();
+    }
+
+    // Otherwise, we perform a reduction on the values in all workgroups
+    // separately, write the workgroup results back to global memory and recurse
+    // until only one workgroup does the reduction and thus gets the final
+    // value.
+    bool first_run = true;
+    while (size > 1) {
+      auto n_wgroups = ((size + values_per_thread - 1) / values_per_thread +
+                        wgroup_size - 1) /
+                       wgroup_size;
+      q.submit([&](sycl::handler& cgh) {
+        sycl::accessor<value_type, 1, sycl::access::mode::read_write,
+                       sycl::access::target::local>
+            local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
+                      cgh);
+        const auto begin = policy.begin();
+
+        cgh.parallel_for(
+            sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+            [=](sycl::nd_item<1> item) {
+              const auto local_id = item.get_local_linear_id();
+              const auto global_id =
+                  wgroup_size * item.get_group_linear_id() * values_per_thread +
+                  local_id;
+              const auto& selected_reducer = ReducerConditional::select(
+                  static_cast<const FunctorType&>(functor),
+                  static_cast<const ReducerType&>(reducer));
+
+              // In the first iteration, we call functor to initialize the local
+              // memory. Otherwise, the local memory is initialized with the
+              // results from the previous iteration that are stored in global
+              // memory. Note that we load values_per_thread values per thread
+              // and immediately combine them to avoid too many threads being
+              // idle in the actual workgroup reduction.
+              using index_type       = typename Policy::index_type;
+              const auto upper_bound = std::min<index_type>(
+                  global_id + values_per_thread * wgroup_size, size);
+              if (first_run) {
+                reference_type update = ValueInit::init(
+                    selected_reducer, &local_mem[local_id * value_count]);
+                for (index_type id = global_id; id < upper_bound;
+                     id += wgroup_size) {
+                  if constexpr (std::is_same<WorkTag, void>::value)
+                    functor(id + begin, update);
+                  else
+                    functor(WorkTag(), id + begin, update);
+                }
+              } else {
+                if (global_id >= size)
+                  ValueInit::init(selected_reducer,
+                                  &local_mem[local_id * value_count]);
+                else {
+                  ValueOps::copy(functor, &local_mem[local_id * value_count],
+                                 &results_ptr[global_id * value_count]);
+                  for (index_type id = global_id + wgroup_size;
+                       id < upper_bound; id += wgroup_size) {
+                    ValueJoin::join(selected_reducer,
+                                    &local_mem[local_id * value_count],
+                                    &results_ptr[id * value_count]);
+                  }
+                }
+              }
+              item.barrier(sycl::access::fence_space::local_space);
+
+              // Perform the actual workgroup reduction. To achieve a better
+              // memory access pattern, we use sequential addressing and a
+              // reversed loop. If the workgroup size is 8, the first element
+              // contains all the values with index%4==0, after the second one
+              // the values with index%2==0 and after the third one index%1==0,
+              // i.e., all values.
+              for (unsigned int stride = wgroup_size / 2; stride > 0;
+                   stride >>= 1) {
+                const auto idx = local_id;
+                if (idx < stride) {
+                  ValueJoin::join(selected_reducer,
+                                  &local_mem[idx * value_count],
+                                  &local_mem[(idx + stride) * value_count]);
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+              }
+
+              // Finally, we copy the workgroup results back to global memory to
+              // be used in the next iteration. If this is the last iteration,
+              // i.e., there is only one workgroup also call final() if
+              // necessary.
+              if (local_id == 0) {
+                ValueOps::copy(
+                    functor,
+                    &results_ptr2[(item.get_group_linear_id()) * value_count],
+                    &local_mem[0]);
+                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+                  if (n_wgroups <= 1)
+                    FunctorFinal<FunctorType, WorkTag>::final(
+                        static_cast<const FunctorType&>(functor),
+                        &results_ptr2[(item.get_group_linear_id()) *
+                                      value_count]);
+              }
+            });
+      });
+      space.fence();
+
+      // FIXME_SYCL this is likely not necessary, see above
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          space, results_ptr, results_ptr2,
+          sizeof(*m_result_ptr) * value_count * n_wgroups);
+      space.fence();
+
+      first_run = false;
+      size      = n_wgroups;
+    }
+
+    // At this point, the reduced value is written to the entry in results_ptr
+    // and all that is left is to copy it back to the given result pointer if
+    // necessary.
+    if (m_result_ptr) {
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          space, m_result_ptr, results_ptr,
+          sizeof(*m_result_ptr) * value_count);
+      space.fence();
+    }
   }
 
-  template <typename TagType>
-  std::enable_if_t<!std::is_void<TagType>::value> exec(reference_type update) {
-    using member_type = typename Policy::member_type;
-    member_type e     = m_policy.end();
-    for (member_type i = m_policy.begin(); i < e; ++i)
-      m_functor(TagType{}, i, update);
+ public:
+  void execute() const {
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *m_policy.space().impl_internal_space_instance();
+    using IndirectKernelMem =
+        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
+    IndirectKernelMem& indirectKernelMem  = instance.m_indirectKernelMem;
+    IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem;
+
+    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+    const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_reducer, indirectReducerMem);
+
+    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
+                       reducer_wrapper.get_functor());
   }
 
-  template <typename T>
-  struct ExtendedReferenceWrapper : std::reference_wrapper<T> {
-    using std::reference_wrapper<T>::reference_wrapper;
+ private:
+  FunctorType m_functor;
+  Policy m_policy;
+  ReducerType m_reducer;
+  pointer_type m_result_ptr;
+};
 
-    using value_type = typename FunctorValueTraits<T, WorkTag>::value_type;
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
+                     Kokkos::Experimental::SYCL> {
+ public:
+  using Policy = Kokkos::MDRangePolicy<Traits...>;
 
-    template <typename Dummy = T>
-    std::enable_if_t<std::is_same_v<Dummy, T> &&
-                     ReduceFunctorHasInit<Dummy>::value>
-    init(value_type& old_value, const value_type& new_value) const {
-      return this->get().init(old_value, new_value);
-    }
+ private:
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
+  using execution_space = typename Analysis::execution_space;
+  using value_type      = typename Analysis::value_type;
+  using pointer_type    = typename Analysis::pointer_type;
+  using reference_type  = typename Analysis::reference_type;
 
-    template <typename Dummy = T>
-    std::enable_if_t<std::is_same_v<Dummy, T> &&
-                     ReduceFunctorHasJoin<Dummy>::value>
-    join(value_type& old_value, const value_type& new_value) const {
-      return this->get().join(old_value, new_value);
-    }
+  using WorkTag = typename Policy::work_tag;
 
-    template <typename Dummy = T>
-    std::enable_if_t<std::is_same_v<Dummy, T> &&
-                     ReduceFunctorHasFinal<Dummy>::value>
-    final(value_type& old_value) const {
-      return this->get().final(old_value);
-    }
+  // MDRangePolicy is not trivially copyable. Hence, replicate the data we
+  // really need in DeviceIterateTile in a trivially copyable struct.
+  struct BarePolicy {
+    using index_type = typename Policy::index_type;
+
+    BarePolicy(const Policy& policy)
+        : m_lower(policy.m_lower),
+          m_upper(policy.m_upper),
+          m_tile(policy.m_tile),
+          m_tile_end(policy.m_tile_end),
+          m_num_tiles(policy.m_num_tiles),
+          m_prod_tile_dims(policy.m_prod_tile_dims) {}
+
+    const typename Policy::point_type m_lower;
+    const typename Policy::point_type m_upper;
+    const typename Policy::tile_type m_tile;
+    const typename Policy::point_type m_tile_end;
+    const typename Policy::index_type m_num_tiles;
+    const typename Policy::index_type m_prod_tile_dims;
+    static constexpr Iterate inner_direction = Policy::inner_direction;
+    static constexpr int rank                = Policy::rank;
   };
 
-  template <typename PolicyType, typename Functor>
-  void sycl_direct_launch(const PolicyType& policy,
-                          const Functor& functor) const {
-    // Convenience references
-    const Kokkos::Experimental::SYCL& space = policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    cl::sycl::queue& q = *instance.m_queue;
-
-    auto result_ptr = static_cast<pointer_type>(
-        sycl::malloc(sizeof(*m_result_ptr), q, sycl::usm::alloc::shared));
-
-    value_type identity{};
-    if constexpr (!std::is_same<ReducerType, InvalidType>::value)
-      m_reducer.init(identity);
-
-    *result_ptr = identity;
-    if constexpr (ReduceFunctorHasInit<Functor>::value)
-      ValueInit::init(functor, result_ptr);
-
-    q.submit([&](cl::sycl::handler& cgh) {
-      // FIXME_SYCL a local size larger than 1 doesn't work for all cases
-      cl::sycl::nd_range<1> range(policy.end() - policy.begin(), 1);
-
-      const auto reduction = [&]() {
-        if constexpr (!std::is_same<ReducerType, InvalidType>::value) {
-          return cl::sycl::ONEAPI::reduction(
-              result_ptr, identity,
-              [this](value_type& old_value, const value_type& new_value) {
-                m_reducer.join(old_value, new_value);
-                return old_value;
-              });
-        } else {
-          if constexpr (ReduceFunctorHasJoin<Functor>::value) {
-            return cl::sycl::ONEAPI::reduction(
-                result_ptr, identity,
-                [functor](value_type& old_value, const value_type& new_value) {
-                  functor.join(old_value, new_value);
-                  return old_value;
-                });
-          } else {
-            return cl::sycl::ONEAPI::reduction(result_ptr, identity,
-                                               std::plus<>());
-          }
-        }
-      }();
-
-      cgh.parallel_for(range, reduction,
-                       [=](cl::sycl::nd_item<1> item, auto& sum) {
-                         const typename Policy::index_type id =
-                             static_cast<typename Policy::index_type>(
-                                 item.get_global_id(0)) +
-                             policy.begin();
-                         value_type partial = identity;
-                         if constexpr (std::is_same<WorkTag, void>::value)
-                           functor(id, partial);
-                         else
-                           functor(WorkTag(), id, partial);
-                         sum.combine(partial);
-                       });
-    });
-
-    q.wait();
-
-    static_assert(ReduceFunctorHasFinal<Functor>::value ==
-                  ReduceFunctorHasFinal<FunctorType>::value);
-    static_assert(ReduceFunctorHasJoin<Functor>::value ==
-                  ReduceFunctorHasJoin<FunctorType>::value);
-
-    if constexpr (ReduceFunctorHasFinal<Functor>::value)
-      FunctorFinal<Functor, WorkTag>::final(functor, result_ptr);
-    else
-      *m_result_ptr = *result_ptr;
-
-    sycl::free(result_ptr, q);
-  }
+ public:
+  // V - View
+  template <typename V>
+  ParallelReduce(
+      const FunctorType& f, const Policy& p, const V& v,
+      typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
+      : m_functor(f), m_policy(p), m_space(p.space()), m_result_ptr(v.data()) {}
+
+  ParallelReduce(const FunctorType& f, const Policy& p,
+                 const ReducerType& reducer)
+      : m_functor(f),
+        m_policy(p),
+        m_space(p.space()),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()) {}
+
+ private:
+  template <typename PolicyType, typename Functor, typename Reducer>
+  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
+                          const Reducer& reducer) const {
+    using ReducerConditional =
+        Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                           FunctorType, ReducerType>;
+    using ReducerTypeFwd = typename ReducerConditional::type;
+    using WorkTagFwd =
+        std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                           WorkTag, void>;
+    using ValueInit =
+        Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+    using ValueJoin =
+        Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+    using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
 
-  template <typename Functor>
-  void sycl_indirect_launch(const Functor& functor) const {
     // Convenience references
-    const Kokkos::Experimental::SYCL& space = m_policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMemory& kernelMem =
-        *instance.m_indirectKernel;
+        *m_space.impl_internal_space_instance();
+    sycl::queue& q = *instance.m_queue;
+
+    const int nwork = m_policy.m_num_tiles;
+    const int block_size =
+        std::pow(2, std::ceil(std::log2(m_policy.m_prod_tile_dims)));
+
+    const sycl::range<1> local_range(block_size);
+    // REMEMBER swap local x<->y to be conforming with Cuda/HIP implementation
+    const sycl::range<1> global_range(nwork * block_size);
+    const sycl::nd_range<1> range{global_range, local_range};
+
+    const size_t wgroup_size = range.get_local_range().size();
+    size_t size              = range.get_global_range().size();
+    const auto init_size =
+        std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
+    const auto& selected_reducer = ReducerConditional::select(functor, reducer);
+    const unsigned int value_count =
+        FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
+            selected_reducer);
+    // FIXME_SYCL only use the first half
+    const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
+        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
+    // FIXME_SYCL without this we are running into a race condition
+    const auto results_ptr2 =
+        results_ptr + std::max(value_count, 1u) * init_size;
+
+    // If size<=1 we only call init(), the functor and possibly final once
+    // working with the global scratch memory but don't copy back to
+    // m_result_ptr yet.
+    if (size <= 1) {
+      q.submit([&](sycl::handler& cgh) {
+        cgh.single_task([=]() {
+          const auto& selected_reducer = ReducerConditional::select(
+              static_cast<const FunctorType&>(functor),
+              static_cast<const ReducerType&>(reducer));
+          reference_type update =
+              ValueInit::init(selected_reducer, results_ptr);
+          if (size == 1) {
+            Kokkos::Impl::Reduce::DeviceIterateTile<
+                Policy::rank, BarePolicy, Functor, typename Policy::work_tag,
+                reference_type>(policy, functor, update, {1, 1, 1}, {0, 0, 0},
+                                {0, 0, 0})
+                .exec_range();
+          }
+          if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+            FunctorFinal<FunctorType, WorkTag>::final(
+                static_cast<const FunctorType&>(functor), results_ptr);
+        });
+      });
+      m_space.fence();
+    }
 
-    // Allocate USM shared memory for the functor
-    kernelMem.resize(std::max(kernelMem.size(), sizeof(functor)));
+    // Otherwise, we perform a reduction on the values in all workgroups
+    // separately, write the workgroup results back to global memory and recurse
+    // until only one workgroup does the reduction and thus gets the final
+    // value.
+    bool first_run = true;
+    while (size > 1) {
+      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
+      q.submit([&](sycl::handler& cgh) {
+        sycl::accessor<value_type, 1, sycl::access::mode::read_write,
+                       sycl::access::target::local>
+            local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
+                      cgh);
+
+        const BarePolicy bare_policy = m_policy;
+
+        cgh.parallel_for(range, [=](sycl::nd_item<1> item) {
+          const auto local_id = item.get_local_linear_id();
+          const auto global_id =
+              wgroup_size * item.get_group_linear_id() + local_id;
+          const auto& selected_reducer = ReducerConditional::select(
+              static_cast<const FunctorType&>(functor),
+              static_cast<const ReducerType&>(reducer));
+
+          // In the first iteration, we call functor to initialize the local
+          // memory. Otherwise, the local memory is initialized with the
+          // results from the previous iteration that are stored in global
+          // memory.
+          using index_type = typename Policy::index_type;
+          const auto upper_bound =
+              std::min<index_type>(global_id + wgroup_size, size);
+          if (first_run) {
+            reference_type update = ValueInit::init(
+                selected_reducer, &local_mem[local_id * value_count]);
+
+            // SWAPPED here to be conforming with CUDA implementation
+            const index_type local_x    = 0;
+            const index_type local_y    = item.get_local_id(0);
+            const index_type local_z    = 0;
+            const index_type global_x   = item.get_group(0);
+            const index_type global_y   = 0;
+            const index_type global_z   = 0;
+            const index_type n_global_x = item.get_group_range(0);
+            const index_type n_global_y = 1;
+            const index_type n_global_z = 1;
+
+            Kokkos::Impl::Reduce::DeviceIterateTile<
+                Policy::rank, BarePolicy, Functor, typename Policy::work_tag,
+                reference_type>(bare_policy, functor, update,
+                                {n_global_x, n_global_y, n_global_z},
+                                {global_x, global_y, global_z},
+                                {local_x, local_y, local_z})
+                .exec_range();
+          } else {
+            if (global_id >= size)
+              ValueInit::init(selected_reducer,
+                              &local_mem[local_id * value_count]);
+            else {
+              ValueOps::copy(functor, &local_mem[local_id * value_count],
+                             &results_ptr[global_id * value_count]);
+              for (index_type id = global_id + wgroup_size; id < upper_bound;
+                   id += wgroup_size) {
+                ValueJoin::join(selected_reducer,
+                                &local_mem[local_id * value_count],
+                                &results_ptr[id * value_count]);
+              }
+            }
+          }
+          item.barrier(sycl::access::fence_space::local_space);
+
+          // Perform the actual workgroup reduction. To achieve a better
+          // memory access pattern, we use sequential addressing and a
+          // reversed loop. If the workgroup size is 8, the first element
+          // contains all the values with index%4==0, after the second one
+          // the values with index%2==0 and after the third one index%1==0,
+          // i.e., all values.
+          for (unsigned int stride = wgroup_size / 2; stride > 0;
+               stride >>= 1) {
+            const auto idx = local_id;
+            if (idx < stride) {
+              ValueJoin::join(selected_reducer, &local_mem[idx * value_count],
+                              &local_mem[(idx + stride) * value_count]);
+            }
+            item.barrier(sycl::access::fence_space::local_space);
+          }
 
-    // Placement new a copy of functor into USM shared memory
-    //
-    // Store it in a unique_ptr to call its destructor on scope exit
-    std::unique_ptr<Functor, Kokkos::Impl::destruct_delete> kernelFunctorPtr(
-        new (kernelMem.data()) Functor(functor));
+          // Finally, we copy the workgroup results back to global memory to
+          // be used in the next iteration. If this is the last iteration,
+          // i.e., there is only one workgroup also call final() if
+          // necessary.
+          if (local_id == 0) {
+            ValueOps::copy(
+                functor,
+                &results_ptr2[(item.get_group_linear_id()) * value_count],
+                &local_mem[0]);
+            if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+              if (n_wgroups <= 1)
+                FunctorFinal<FunctorType, WorkTag>::final(
+                    static_cast<const FunctorType&>(functor),
+                    &results_ptr2[(item.get_group_linear_id()) * value_count]);
+          }
+        });
+      });
+      m_space.fence();
+
+      // FIXME_SYCL this is likely not necessary, see above
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          m_space, results_ptr, results_ptr2,
+          sizeof(*m_result_ptr) * value_count * n_wgroups);
+      m_space.fence();
+
+      first_run = false;
+      size      = n_wgroups;
+    }
 
-    auto kernelFunctor = ExtendedReferenceWrapper<Functor>(*kernelFunctorPtr);
-    sycl_direct_launch(m_policy, kernelFunctor);
+    // At this point, the reduced value is written to the entry in results_ptr
+    // and all that is left is to copy it back to the given result pointer if
+    // necessary.
+    if (m_result_ptr) {
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          m_space, m_result_ptr, results_ptr,
+          sizeof(*m_result_ptr) * value_count);
+      m_space.fence();
+    }
   }
 
  public:
-  void execute() const {
-    if (m_policy.begin() == m_policy.end()) {
-      const Kokkos::Experimental::SYCL& space = m_policy.space();
-      Kokkos::Experimental::Impl::SYCLInternal& instance =
-          *space.impl_internal_space_instance();
-      cl::sycl::queue& q = *instance.m_queue;
-
-      pointer_type result_ptr =
-          ReduceFunctorHasFinal<FunctorType>::value
-              ? static_cast<pointer_type>(sycl::malloc(
-                    sizeof(*m_result_ptr), q, sycl::usm::alloc::shared))
-              : m_result_ptr;
-
-      sycl::usm::alloc result_ptr_type =
-          sycl::get_pointer_type(result_ptr, q.get_context());
-
-      switch (result_ptr_type) {
-        case sycl::usm::alloc::host:
-        case sycl::usm::alloc::shared:
-          ValueInit::init(m_functor, result_ptr);
-          break;
-        case sycl::usm::alloc::device:
-          // non-USM-allocated memory
-        case sycl::usm::alloc::unknown: {
-          value_type host_result;
-          ValueInit::init(m_functor, &host_result);
-          q.memcpy(result_ptr, &host_result, sizeof(host_result)).wait();
-          break;
-        }
-        default: Kokkos::abort("pointer type outside of SYCL specs.");
-      }
-
-      if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
-        FunctorFinal<FunctorType, WorkTag>::final(m_functor, result_ptr);
-        sycl::free(result_ptr, q);
-      }
-
-      return;
-    }
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& policy, const Functor&) {
+    return policy.space().impl_internal_space_instance()->m_maxThreadsPerSM;
+  }
 
-    if constexpr (std::is_trivially_copyable_v<decltype(m_functor)>)
-      sycl_direct_launch(m_policy, m_functor);
-    else
-      sycl_indirect_launch(m_functor);
+  void execute() const {
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *m_space.impl_internal_space_instance();
+    using IndirectKernelMem =
+        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
+    IndirectKernelMem& indirectKernelMem  = instance.m_indirectKernelMem;
+    IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem;
+
+    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+    const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_reducer, indirectReducerMem);
+
+    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
+                       reducer_wrapper.get_functor());
   }
 
  private:
   FunctorType m_functor;
-  Policy m_policy;
+  BarePolicy m_policy;
+  const Kokkos::Experimental::SYCL& m_space;
   ReducerType m_reducer;
   pointer_type m_result_ptr;
 };
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
index 3595255b07648756d5aff44a000ae8b5078db5e0..5eac6bf9da62b29b9d15697bc5061c00db504e0c 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
@@ -83,21 +83,21 @@ class ParallelScanSYCLBase {
 
  private:
   template <typename Functor>
-  void scan_internal(cl::sycl::queue& q, const Functor& functor,
+  void scan_internal(sycl::queue& q, const Functor& functor,
                      pointer_type global_mem, std::size_t size) const {
     // FIXME_SYCL optimize
     constexpr size_t wgroup_size = 32;
     auto n_wgroups               = (size + wgroup_size - 1) / wgroup_size;
 
     // FIXME_SYCL The allocation should be handled by the execution space
-    auto deleter = [&q](value_type* ptr) { cl::sycl::free(ptr, q); };
+    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
     std::unique_ptr<value_type[], decltype(deleter)> group_results_memory(
         static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * n_wgroups,
                                                q, sycl::usm::alloc::shared)),
         deleter);
     auto group_results = group_results_memory.get();
 
-    q.submit([&](cl::sycl::handler& cgh) {
+    q.submit([&](sycl::handler& cgh) {
       sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                      sycl::access::target::local>
           local_mem(sycl::range<1>(wgroup_size), cgh);
@@ -159,7 +159,7 @@ class ParallelScanSYCLBase {
     });
 
     if (n_wgroups > 1) scan_internal(q, functor, group_results, n_wgroups);
-    q.wait();
+    m_policy.space().fence();
 
     q.submit([&](sycl::handler& cgh) {
       cgh.parallel_for(sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
@@ -171,7 +171,7 @@ class ParallelScanSYCLBase {
                                &group_results[item.get_group_linear_id()]);
                        });
     });
-    q.wait();
+    m_policy.space().fence();
   }
 
   template <typename Functor>
@@ -180,18 +180,17 @@ class ParallelScanSYCLBase {
     const Kokkos::Experimental::SYCL& space = m_policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
-    cl::sycl::queue& q = *instance.m_queue;
+    sycl::queue& q = *instance.m_queue;
 
     const std::size_t len = m_policy.end() - m_policy.begin();
 
     // Initialize global memory
     q.submit([&](sycl::handler& cgh) {
       auto global_mem = m_scratch_space;
-      auto policy     = m_policy;
+      auto begin      = m_policy.begin();
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
         const typename Policy::index_type id =
-            static_cast<typename Policy::index_type>(item.get_id()) +
-            policy.begin();
+            static_cast<typename Policy::index_type>(item.get_id()) + begin;
         value_type update{};
         ValueInit::init(functor, &update);
         if constexpr (std::is_same<WorkTag, void>::value)
@@ -201,7 +200,7 @@ class ParallelScanSYCLBase {
         ValueOps::copy(functor, &global_mem[id], &update);
       });
     });
-    q.wait();
+    space.fence();
 
     // Perform the actual exlcusive scan
     scan_internal(q, functor, m_scratch_space, len);
@@ -220,51 +219,36 @@ class ParallelScanSYCLBase {
         ValueOps::copy(functor, &global_mem[global_id], &update);
       });
     });
-    q.wait();
-  }
-
-  template <typename Functor>
-  void sycl_indirect_launch(const Functor& functor) const {
-    // Convenience references
-    const Kokkos::Experimental::SYCL& space = m_policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMemory& kernelMem =
-        *instance.m_indirectKernel;
-
-    // Allocate USM shared memory for the functor
-    kernelMem.resize(std::max(kernelMem.size(), sizeof(functor)));
-
-    // Placement new a copy of functor into USM shared memory
-    //
-    // Store it in a unique_ptr to call its destructor on scope exit
-    std::unique_ptr<Functor, Kokkos::Impl::destruct_delete> kernelFunctorPtr(
-        new (kernelMem.data()) Functor(functor));
-
-    auto kernelFunctor = std::reference_wrapper(*kernelFunctorPtr);
-    sycl_direct_launch(kernelFunctor);
+    space.fence();
   }
 
  public:
   template <typename PostFunctor>
   void impl_execute(const PostFunctor& post_functor) {
-    const auto& q = *(m_policy.space().impl_internal_space_instance()->m_queue);
+    if (m_policy.begin() == m_policy.end()) return;
+
+    const auto& q = *m_policy.space().impl_internal_space_instance()->m_queue;
     const std::size_t len = m_policy.end() - m_policy.begin();
 
     // FIXME_SYCL The allocation should be handled by the execution space
     // consider only storing one value per block and recreate initial results in
     // the end before doing the final pass
-    auto deleter = [&q](value_type* ptr) { cl::sycl::free(ptr, q); };
+    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
     std::unique_ptr<value_type[], decltype(deleter)> result_memory(
         static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * len, q,
                                                sycl::usm::alloc::shared)),
         deleter);
     m_scratch_space = result_memory.get();
 
-    if constexpr (std::is_trivially_copyable_v<decltype(m_functor)>)
-      sycl_direct_launch(m_policy, m_functor);
-    else
-      sycl_indirect_launch(m_functor);
+    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
+        indirectKernelMem = m_policy.space()
+                                .impl_internal_space_instance()
+                                ->m_indirectKernelMem;
+
+    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+
+    sycl_direct_launch(functor_wrapper.get_functor());
     post_functor();
   }
 
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..738620926b5496b9710ce001b77c6fb625325320
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -0,0 +1,835 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SYCL_PARALLEL_TEAM_HPP
+#define KOKKOS_SYCL_PARALLEL_TEAM_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+#include <SYCL/Kokkos_SYCL_Team.hpp>
+
+namespace Kokkos {
+namespace Impl {
+template <typename... Properties>
+class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
+    : public PolicyTraits<Properties...> {
+ public:
+  using execution_policy = TeamPolicyInternal;
+
+  using traits = PolicyTraits<Properties...>;
+
+  template <typename ExecSpace, typename... OtherProperties>
+  friend class TeamPolicyInternal;
+
+ private:
+  static int constexpr MAX_WARP = 8;
+
+  typename traits::execution_space m_space;
+  int m_league_size;
+  int m_team_size;
+  int m_vector_length;
+  int m_team_scratch_size[2];
+  int m_thread_scratch_size[2];
+  int m_chunk_size;
+  bool m_tune_team_size;
+  bool m_tune_vector_length;
+
+ public:
+  using execution_space = Kokkos::Experimental::SYCL;
+
+  template <class... OtherProperties>
+  TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) {
+    m_league_size            = p.m_league_size;
+    m_team_size              = p.m_team_size;
+    m_vector_length          = p.m_vector_length;
+    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
+    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size             = p.m_chunk_size;
+    m_space                  = p.m_space;
+    m_tune_team_size         = p.m_tune_team_size;
+    m_tune_vector_length     = p.m_tune_vector_length;
+  }
+
+  template <typename FunctorType>
+  int team_size_max(FunctorType const& f, ParallelForTag const&) const {
+    return internal_team_size_max_for(f);
+  }
+
+  template <class FunctorType>
+  inline int team_size_max(const FunctorType& f,
+                           const ParallelReduceTag&) const {
+    return internal_team_size_max_reduce(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
+                           const ParallelReduceTag&) const {
+    return internal_team_size_max_reduce(f);
+  }
+
+  template <typename FunctorType>
+  int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
+    return internal_team_size_max_for(f);
+  }
+
+  template <typename FunctorType>
+  inline int team_size_recommended(FunctorType const& f,
+                                   ParallelReduceTag const&) const {
+    return internal_team_size_recommended_reduce(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  int team_size_recommended(FunctorType const& f, ReducerType const&,
+                            ParallelReduceTag const&) const {
+    return internal_team_size_recommended_reduce(f);
+  }
+  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
+  inline bool impl_auto_team_size() const { return m_tune_team_size; }
+  static int vector_length_max() {
+    // FIXME_SYCL provide a reasonable value
+    return 1;
+  }
+
+  static int verify_requested_vector_length(int requested_vector_length) {
+    int test_vector_length =
+        std::min(requested_vector_length, vector_length_max());
+
+    // Allow only power-of-two vector_length
+    if (!(is_integral_power_of_two(test_vector_length))) {
+      int test_pow2 = 1;
+      for (int i = 0; i < 5; i++) {
+        test_pow2 = test_pow2 << 1;
+        if (test_pow2 > test_vector_length) {
+          break;
+        }
+      }
+      test_vector_length = test_pow2 >> 1;
+    }
+
+    return test_vector_length;
+  }
+
+  static int scratch_size_max(int level) {
+    return level == 0 ? 1024 * 32
+                      :           // FIXME_SYCL arbitrarily setting this to 32kB
+               20 * 1024 * 1024;  // FIXME_SYCL arbitrarily setting this to 20MB
+  }
+  inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
+  inline void impl_set_team_size(size_t size) { m_team_size = size; }
+  int impl_vector_length() const { return m_vector_length; }
+  KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
+
+  int team_size() const { return m_team_size; }
+
+  int league_size() const { return m_league_size; }
+
+  int scratch_size(int level, int team_size_ = -1) const {
+    if (team_size_ < 0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] +
+           team_size_ * m_thread_scratch_size[level];
+  }
+
+  int team_scratch_size(int level) const { return m_team_scratch_size[level]; }
+
+  int thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
+
+  typename traits::execution_space space() const { return m_space; }
+
+  TeamPolicyInternal()
+      : m_space(typename traits::execution_space()),
+        m_league_size(0),
+        m_team_size(-1),
+        m_vector_length(0),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0),
+        m_tune_team_size(false),
+        m_tune_vector_length(false) {}
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request, int vector_length_request = 1)
+      : m_space(space_),
+        m_league_size(league_size_),
+        m_team_size(team_size_request),
+        m_vector_length(
+            (vector_length_request > 0)
+                ? verify_requested_vector_length(vector_length_request)
+                : (verify_requested_vector_length(1))),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(0),
+        m_tune_team_size(bool(team_size_request <= 0)),
+        m_tune_vector_length(bool(vector_length_request <= 0)) {
+    // FIXME_SYCL check paramters
+  }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
+  // FLAG
+  /** \brief  Specify league size and team size, request vector length*/
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+                     )
+      : TeamPolicyInternal(space_, league_size_, team_size_request, -1)
+
+  {}
+
+  /** \brief  Specify league size, request team size and vector length*/
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+
+                     )
+      : TeamPolicyInternal(space_, league_size_, -1, -1)
+
+  {}
+
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  TeamPolicyInternal(int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
+                           vector_length_request) {}
+
+  /** \brief  Specify league size and team size, request vector length*/
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+
+                     )
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, -1)
+
+  {}
+
+  /** \brief  Specify league size, request team size and vector length*/
+  TeamPolicyInternal(int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+
+                     )
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
+                           -1) {}
+
+  int chunk_size() const { return m_chunk_size; }
+
+  TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch
+   * hierarchy */
+  TeamPolicyInternal& set_scratch_size(int level,
+                                       PerTeamValue const& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  }
+
+  /** \brief set per thread scratch size for a specific level of the scratch
+   * hierarchy */
+  TeamPolicyInternal& set_scratch_size(int level,
+                                       PerThreadValue const& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  /** \brief set per thread and per team scratch size for a specific level of
+   * the scratch hierarchy */
+  TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team,
+                                       PerThreadValue const& per_thread) {
+    m_team_scratch_size[level]   = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  using member_type = Kokkos::Impl::SYCLTeamMember;
+
+ protected:
+  template <class FunctorType>
+  int internal_team_size_max_for(const FunctorType& /*f*/) const {
+    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
+    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
+    // total:
+    // 2*sizeof(double)+m_team_scratch_size[0]
+    // + m_team_size(sizeof(double)+m_thread_scratch_size[0])
+    const int max_threads_for_memory =
+        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
+         2 * sizeof(double) - m_team_scratch_size[0]) /
+        (sizeof(double) + m_thread_scratch_size[0]);
+    return std::min<int>(
+        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+        max_threads_for_memory);
+  }
+
+  template <class FunctorType>
+  int internal_team_size_max_reduce(const FunctorType& f) const {
+    using Analysis        = FunctorAnalysis<FunctorPatternInterface::REDUCE,
+                                     TeamPolicyInternal, FunctorType>;
+    using value_type      = typename Analysis::value_type;
+    const int value_count = Analysis::value_count(f);
+
+    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
+    // reducer_memsize = sizeof(value_type) * m_team_size * value_count
+    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
+    // total:
+    // 2*sizeof(double)+m_team_scratch_size[0]
+    // + m_team_size(sizeof(double)+sizeof(value_type)*value_count
+    //               +m_thread_scratch_size[0])
+    const int max_threads_for_memory =
+        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
+         2 * sizeof(double) - m_team_scratch_size[0]) /
+        (sizeof(double) + sizeof(value_type) * value_count +
+         m_thread_scratch_size[0]);
+    return std::min<int>(
+        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+        max_threads_for_memory);
+  }
+
+  template <class FunctorType>
+  int internal_team_size_recommended_for(const FunctorType& f) const {
+    // FIXME_SYCL improve
+    return internal_team_size_max_for(f);
+  }
+
+  template <class FunctorType>
+  int internal_team_size_recommended_reduce(const FunctorType& f) const {
+    // FIXME_SYCL improve
+    return internal_team_size_max_reduce(f);
+  }
+};
+
+template <typename FunctorType, typename... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Experimental::SYCL> {
+ public:
+  using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
+  using functor_type = FunctorType;
+  using size_type    = ::Kokkos::Experimental::SYCL::size_type;
+
+ private:
+  using member_type   = typename Policy::member_type;
+  using work_tag      = typename Policy::work_tag;
+  using launch_bounds = typename Policy::launch_bounds;
+
+  FunctorType const m_functor;
+  Policy const m_policy;
+  size_type const m_league_size;
+  int m_team_size;
+  size_type const m_vector_size;
+  int m_shmem_begin;
+  int m_shmem_size;
+  void* m_scratch_ptr[2];
+  int m_scratch_size[2];
+
+  template <typename Functor>
+  void sycl_direct_launch(const Policy& policy, const Functor& functor) const {
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = *instance.m_queue;
+
+    q.submit([&](sycl::handler& cgh) {
+      // FIXME_SYCL accessors seem to need a size greater than zero at least for
+      // host queues
+      sycl::accessor<char, 1, sycl::access::mode::read_write,
+                     sycl::access::target::local>
+          team_scratch_memory_L0(
+              sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
+              cgh);
+
+      // Avoid capturing *this since it might not be trivially copyable
+      const auto shmem_begin     = m_shmem_begin;
+      const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
+      void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
+
+      cgh.parallel_for(
+          sycl::nd_range<2>(
+              sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+              sycl::range<2>(m_team_size, m_vector_size)),
+          [=](sycl::nd_item<2> item) {
+            const member_type team_member(
+                team_scratch_memory_L0.get_pointer(), shmem_begin,
+                scratch_size[0],
+                static_cast<char*>(scratch_ptr[1]) +
+                    item.get_group(0) * scratch_size[1],
+                scratch_size[1], item);
+            if constexpr (std::is_same<work_tag, void>::value)
+              functor(team_member);
+            else
+              functor(work_tag(), team_member);
+          });
+    });
+    space.fence();
+  }
+
+ public:
+  inline void execute() const {
+    if (m_league_size == 0) return;
+
+    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
+        indirectKernelMem = m_policy.space()
+                                .impl_internal_space_instance()
+                                ->m_indirectKernelMem;
+
+    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+
+    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+  }
+
+  ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()) {
+    // FIXME_SYCL optimize
+    if (m_team_size < 0) m_team_size = 32;
+
+    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
+    m_shmem_size =
+        (m_policy.scratch_size(0, m_team_size) +
+         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+
+    // FIXME_SYCL so far accessors used instead of these pointers
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+    const auto& space    = *m_policy.space().impl_internal_space_instance();
+    const sycl::queue& q = *space.m_queue;
+    m_scratch_ptr[0]     = nullptr;
+    m_scratch_ptr[1]     = sycl::malloc_device(
+        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+
+    if (static_cast<int>(space.m_maxShmemPerBlock) <
+        m_shmem_size - m_shmem_begin) {
+      std::stringstream out;
+      out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
+             "Requested "
+          << m_shmem_size - m_shmem_begin << " bytes but maximum is "
+          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
+          << '\n';
+      Kokkos::Impl::throw_runtime_exception(out.str());
+    }
+
+    if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
+      Kokkos::Impl::throw_runtime_exception(
+          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
+  }
+
+  // FIXME_SYCL remove when managing m_scratch_ptr[1] in the execution space
+  // instance
+  ParallelFor(const ParallelFor&) = delete;
+  ParallelFor& operator=(const ParallelFor&) = delete;
+
+  ~ParallelFor() {
+    const Kokkos::Experimental::SYCL& space = m_policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = *instance.m_queue;
+    sycl::free(m_scratch_ptr[1], q);
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template <class FunctorType, class ReducerType, class... Properties>
+class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                     ReducerType, Kokkos::Experimental::SYCL> {
+ public:
+  using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
+
+ private:
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
+  using member_type   = typename Policy::member_type;
+  using WorkTag       = typename Policy::work_tag;
+  using launch_bounds = typename Policy::launch_bounds;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+  using value_type     = typename Analysis::value_type;
+
+ public:
+  using functor_type = FunctorType;
+  using size_type    = Kokkos::Experimental::SYCL::size_type;
+
+ private:
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  // FIXME_SYCL avoid reallocating memory for reductions
+  /*  size_type* m_scratch_space;
+    size_type* m_scratch_flags;
+    size_type m_team_begin;*/
+  size_type m_shmem_begin;
+  size_type m_shmem_size;
+  void* m_scratch_ptr[2];
+  int m_scratch_size[2];
+  const size_type m_league_size;
+  int m_team_size;
+  const size_type m_vector_size;
+
+  template <typename PolicyType, typename Functor, typename Reducer>
+  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
+                          const Reducer& reducer) const {
+    using ReducerConditional =
+        Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                           FunctorType, ReducerType>;
+    using ReducerTypeFwd = typename ReducerConditional::type;
+    using WorkTagFwd =
+        std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                           WorkTag, void>;
+    using ValueInit =
+        Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+    using ValueJoin =
+        Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+    using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
+
+    auto selected_reducer = ReducerConditional::select(functor, reducer);
+
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = *instance.m_queue;
+
+    // FIXME_SYCL optimize
+    const size_t wgroup_size = m_team_size;
+    std::size_t size         = m_league_size * m_team_size;
+    const auto init_size =
+        std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
+    const unsigned int value_count =
+        FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
+            selected_reducer);
+    // FIXME_SYCL only use the first half
+    const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
+        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
+    // FIXME_SYCL without this we are running into a race condition
+    const auto results_ptr2 =
+        results_ptr + std::max(value_count, 1u) * init_size;
+
+    // If size<=1 we only call init(), the functor and possibly final once
+    // working with the global scratch memory but don't copy back to
+    // m_result_ptr yet.
+    if (size <= 1) {
+      q.submit([&](sycl::handler& cgh) {
+        // FIXME_SYCL accessors seem to need a size greater than zero at least
+        // for host queues
+        sycl::accessor<char, 1, sycl::access::mode::read_write,
+                       sycl::access::target::local>
+            team_scratch_memory_L0(
+                sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
+                cgh);
+
+        // Avoid capturing *this since it might not be trivially copyable
+        const auto shmem_begin     = m_shmem_begin;
+        const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
+        void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
+
+        cgh.parallel_for(
+            sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)),
+            [=](sycl::nd_item<2> item) {
+              const auto& selected_reducer = ReducerConditional::select(
+                  static_cast<const FunctorType&>(functor),
+                  static_cast<const ReducerType&>(reducer));
+              reference_type update =
+                  ValueInit::init(selected_reducer, results_ptr);
+              if (size == 1) {
+                const member_type team_member(
+                    team_scratch_memory_L0.get_pointer(), shmem_begin,
+                    scratch_size[0], static_cast<char*>(scratch_ptr[1]),
+                    scratch_size[1], item);
+                if constexpr (std::is_same<WorkTag, void>::value)
+                  functor(team_member, update);
+                else
+                  functor(WorkTag(), team_member, update);
+              }
+              if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+                FunctorFinal<FunctorType, WorkTag>::final(
+                    static_cast<const FunctorType&>(functor), results_ptr);
+            });
+      });
+      space.fence();
+    }
+
+    // Otherwise, we perform a reduction on the values in all workgroups
+    // separately, write the workgroup results back to global memory and recurse
+    // until only one workgroup does the reduction and thus gets the final
+    // value.
+    bool first_run = true;
+    while (size > 1) {
+      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
+      q.submit([&](sycl::handler& cgh) {
+        sycl::accessor<value_type, 1, sycl::access::mode::read_write,
+                       sycl::access::target::local>
+            local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
+                      cgh);
+        // FIXME_SYCL accessors seem to need a size greater than zero at least
+        // for host queues
+        sycl::accessor<char, 1, sycl::access::mode::read_write,
+                       sycl::access::target::local>
+            team_scratch_memory_L0(
+                sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
+                cgh);
+
+        // Avoid capturing *this since it might not be trivially copyable
+        const auto shmem_begin     = m_shmem_begin;
+        const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
+        void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
+
+        cgh.parallel_for(
+            sycl::nd_range<2>(
+                sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+                sycl::range<2>(m_team_size, m_vector_size)),
+            [=](sycl::nd_item<2> item) {
+              const auto local_id = item.get_local_linear_id();
+              const auto global_id =
+                  wgroup_size * item.get_group_linear_id() + local_id;
+              const auto& selected_reducer = ReducerConditional::select(
+                  static_cast<const FunctorType&>(functor),
+                  static_cast<const ReducerType&>(reducer));
+
+              // In the first iteration, we call functor to initialize the local
+              // memory. Otherwise, the local memory is initialized with the
+              // results from the previous iteration that are stored in global
+              // memory. Note that we load values_per_thread values per thread
+              // and immediately combine them to avoid too many threads being
+              // idle in the actual workgroup reduction.
+              if (first_run) {
+                reference_type update = ValueInit::init(
+                    selected_reducer, &local_mem[local_id * value_count]);
+                const member_type team_member(
+                    team_scratch_memory_L0.get_pointer(), shmem_begin,
+                    scratch_size[0],
+                    static_cast<char*>(scratch_ptr[1]) +
+                        item.get_group(0) * scratch_size[1],
+                    scratch_size[1], item);
+                if constexpr (std::is_same<WorkTag, void>::value)
+                  functor(team_member, update);
+                else
+                  functor(WorkTag(), team_member, update);
+              } else {
+                if (global_id >= size)
+                  ValueInit::init(selected_reducer,
+                                  &local_mem[local_id * value_count]);
+                else {
+                  ValueOps::copy(functor, &local_mem[local_id * value_count],
+                                 &results_ptr[global_id * value_count]);
+                }
+              }
+              item.barrier(sycl::access::fence_space::local_space);
+
+              // Perform the actual workgroup reduction. To achieve a better
+              // memory access pattern, we use sequential addressing and a
+              // reversed loop. If the workgroup size is 8, the first element
+              // contains all the values with index%4==0, after the second one
+              // the values with index%2==0 and after the third one index%1==0,
+              // i.e., all values.
+              for (unsigned int stride = wgroup_size / 2; stride > 0;
+                   stride >>= 1) {
+                const auto idx = local_id;
+                if (idx < stride) {
+                  ValueJoin::join(selected_reducer,
+                                  &local_mem[idx * value_count],
+                                  &local_mem[(idx + stride) * value_count]);
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+              }
+
+              // Finally, we copy the workgroup results back to global memory to
+              // be used in the next iteration. If this is the last iteration,
+              // i.e., there is only one workgroup also call final() if
+              // necessary.
+              if (local_id == 0) {
+                ValueOps::copy(
+                    functor,
+                    &results_ptr2[(item.get_group_linear_id()) * value_count],
+                    &local_mem[0]);
+                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+                  if (n_wgroups <= 1 && item.get_group_linear_id() == 0) {
+                    FunctorFinal<FunctorType, WorkTag>::final(
+                        static_cast<const FunctorType&>(functor),
+                        &results_ptr2[(item.get_group_linear_id()) *
+                                      value_count]);
+                  }
+              }
+            });
+      });
+      space.fence();
+
+      // FIXME_SYCL this is likely not necessary, see above
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          space, results_ptr, results_ptr2,
+          sizeof(*m_result_ptr) * value_count * n_wgroups);
+      space.fence();
+
+      first_run = false;
+      size      = n_wgroups;
+    }
+
+    // At this point, the reduced value is written to the entry in results_ptr
+    // and all that is left is to copy it back to the given result pointer if
+    // necessary.
+    if (m_result_ptr) {
+      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
+          space, m_result_ptr, results_ptr,
+          sizeof(*m_result_ptr) * value_count);
+      space.fence();
+    }
+  }
+
+ public:
+  inline void execute() {
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *m_policy.space().impl_internal_space_instance();
+    using IndirectKernelMem =
+        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
+    IndirectKernelMem& indirectKernelMem  = instance.m_indirectKernelMem;
+    IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem;
+
+    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_functor, indirectKernelMem);
+    const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
+        m_reducer, indirectReducerMem);
+
+    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
+                       reducer_wrapper.get_functor());
+  }
+
+ private:
+  void initialize() {
+    // FIXME_SYCL optimize
+    if (m_team_size < 0) m_team_size = 32;
+    // Must be a power of two greater than two, get the one not bigger than the
+    // requested one.
+    if ((m_team_size & m_team_size - 1) || m_team_size < 2) {
+      int temp_team_size = 2;
+      while ((temp_team_size << 1) < m_team_size) temp_team_size <<= 1;
+      m_team_size = temp_team_size;
+    }
+
+    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
+    m_shmem_size =
+        (m_policy.scratch_size(0, m_team_size) +
+         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+
+    // FIXME_SYCL so far accessors used instead of these pointers
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+    const auto& space    = *m_policy.space().impl_internal_space_instance();
+    const sycl::queue& q = *space.m_queue;
+    m_scratch_ptr[0]     = nullptr;
+    m_scratch_ptr[1]     = sycl::malloc_device(
+        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+
+    if (static_cast<int>(space.m_maxShmemPerBlock) <
+        m_shmem_size - m_shmem_begin) {
+      std::stringstream out;
+      out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
+             "Requested "
+          << m_shmem_size - m_shmem_begin << " bytes but maximum is "
+          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
+          << '\n';
+      Kokkos::Impl::throw_runtime_exception(out.str());
+    }
+
+    if (m_team_size > m_policy.team_size_max(m_functor, ParallelForTag{}))
+      Kokkos::Impl::throw_runtime_exception(
+          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
+  }
+
+ public:
+  template <class ViewType>
+  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
+                 ViewType const& arg_result,
+                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
+                                         void*>::type = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()) {
+    initialize();
+  }
+
+  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
+                 ReducerType const& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()) {
+    initialize();
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
index fd6f11d58c89f91410f448aee3f9af2db1cd9c46..75741438e295c543db2737e6943ea52e244d69db 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
@@ -42,6 +42,8 @@
 //@HEADER
 */
 
+#include <Kokkos_Macros.hpp>
+
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_SYCL.hpp>
 #include <Kokkos_SYCL_Space.hpp>
@@ -55,7 +57,7 @@
 namespace Kokkos {
 namespace Impl {
 namespace {
-auto USM_memcpy(cl::sycl::queue& q, void* dst, const void* src, size_t n) {
+auto USM_memcpy(sycl::queue& q, void* dst, const void* src, size_t n) {
   return q.memcpy(dst, src, n);
 }
 
@@ -65,10 +67,10 @@ void USM_memcpy(Kokkos::Experimental::Impl::SYCLInternal& space, void* dst,
 }
 
 void USM_memcpy(void* dst, const void* src, size_t n) {
-  Kokkos::Experimental::Impl::SYCLInternal::singleton().m_queue->wait();
-  USM_memcpy(*Kokkos::Experimental::Impl::SYCLInternal::singleton().m_queue,
-             dst, src, n)
-      .wait();
+  Experimental::SYCL().fence();
+  auto event = USM_memcpy(
+      *Experimental::Impl::SYCLInternal::singleton().m_queue, dst, src, n);
+  Experimental::Impl::SYCLInternal::fence(event);
 }
 }  // namespace
 
@@ -123,29 +125,27 @@ DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
 namespace Kokkos {
 namespace Experimental {
 
-SYCLDeviceUSMSpace::SYCLDeviceUSMSpace() : m_device(SYCL().sycl_device()) {}
+SYCLDeviceUSMSpace::SYCLDeviceUSMSpace()
+    : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {}
+SYCLDeviceUSMSpace::SYCLDeviceUSMSpace(sycl::queue queue)
+    : m_queue(std::move(queue)) {}
 
-void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const {
-  return allocate("[unlabeled]", arg_alloc_size);
-}
-void* SYCLDeviceUSMSpace::allocate(const char* arg_label,
-                                   const size_t arg_alloc_size,
-                                   const size_t arg_logical_size) const {
-  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
-}
+SYCLSharedUSMSpace::SYCLSharedUSMSpace()
+    : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {}
+SYCLSharedUSMSpace::SYCLSharedUSMSpace(sycl::queue queue)
+    : m_queue(std::move(queue)) {}
 
-void* SYCLDeviceUSMSpace::impl_allocate(
+void* allocate_sycl(
     const char* arg_label, const size_t arg_alloc_size,
-    const size_t arg_logical_size,
-    const Kokkos::Tools::SpaceHandle arg_handle) const {
-  const cl::sycl::queue& queue =
-      *SYCL().impl_internal_space_instance()->m_queue;
-  void* const hostPtr = cl::sycl::malloc_device(arg_alloc_size, queue);
+    const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle,
+    const RawMemoryAllocationFailure::AllocationMechanism failure_tag,
+    const sycl::usm::alloc allocation_kind, const sycl::queue& queue) {
+  void* const hostPtr = sycl::malloc(arg_alloc_size, queue, allocation_kind);
 
   if (hostPtr == nullptr)
     throw RawMemoryAllocationFailure(
         arg_alloc_size, 1, RawMemoryAllocationFailure::FailureMode::Unknown,
-        RawMemoryAllocationFailure::AllocationMechanism::SYCLMalloc);
+        failure_tag);
 
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     const size_t reported_size =
@@ -157,6 +157,47 @@ void* SYCLDeviceUSMSpace::impl_allocate(
   return hostPtr;
 }
 
+void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+
+void* SYCLDeviceUSMSpace::allocate(const char* arg_label,
+                                   const size_t arg_alloc_size,
+                                   const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocDevice,
+      sycl::usm::alloc::device, m_queue);
+}
+
+void* SYCLSharedUSMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void* SYCLSharedUSMSpace::allocate(const char* arg_label,
+                                   const size_t arg_alloc_size,
+                                   const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocShared,
+      sycl::usm::alloc::shared, m_queue);
+}
+
+void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                     const size_t arg_alloc_size, const size_t arg_logical_size,
+                     const Kokkos::Tools::SpaceHandle arg_handle,
+                     const sycl::queue& queue) {
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
+                                      reported_size);
+  }
+
+  sycl::free(arg_alloc_ptr, queue);
+}
+
 void SYCLDeviceUSMSpace::deallocate(void* const arg_alloc_ptr,
                                     const size_t arg_alloc_size) const {
   deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
@@ -165,21 +206,21 @@ void SYCLDeviceUSMSpace::deallocate(const char* arg_label,
                                     void* const arg_alloc_ptr,
                                     const size_t arg_alloc_size,
                                     const size_t arg_logical_size) const {
-  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
+  sycl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size,
+                  Kokkos::Tools::make_space_handle(name()), m_queue);
 }
-void SYCLDeviceUSMSpace::impl_deallocate(
-    const char* arg_label, void* const arg_alloc_ptr,
-    const size_t arg_alloc_size, const size_t arg_logical_size,
-    const Kokkos::Tools::SpaceHandle arg_handle) const {
-  if (Kokkos::Profiling::profileLibraryLoaded()) {
-    const size_t reported_size =
-        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
-    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
-                                      reported_size);
-  }
-  const cl::sycl::queue& queue =
-      *SYCL().impl_internal_space_instance()->m_queue;
-  cl::sycl::free(arg_alloc_ptr, queue);
+
+void SYCLSharedUSMSpace::deallocate(void* const arg_alloc_ptr,
+                                    const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void SYCLSharedUSMSpace::deallocate(const char* arg_label,
+                                    void* const arg_alloc_ptr,
+                                    const size_t arg_alloc_size,
+                                    const size_t arg_logical_size) const {
+  sycl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size,
+                  Kokkos::Tools::make_space_handle(name()), m_queue);
 }
 
 }  // namespace Experimental
@@ -191,6 +232,9 @@ namespace Impl {
 #ifdef KOKKOS_ENABLE_DEBUG
 SharedAllocationRecord<void, void> SharedAllocationRecord<
     Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record;
+
+SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record;
 #endif
 
 SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
@@ -200,7 +244,7 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
         const SharedAllocationRecord<void, void>::function_type dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
                                   void>::s_root_record,
@@ -208,27 +252,36 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
           Kokkos::Impl::checked_allocation_with_header(space, label, size),
           sizeof(SharedAllocationHeader) + size, dealloc),
       m_space(space) {
-  if (Kokkos::Profiling::profileLibraryLoaded()) {
-    Kokkos::Profiling::allocateData(
-        Kokkos::Profiling::make_space_handle(space.name()), label, data(),
-        size);
-  }
-
   SharedAllocationHeader header;
 
-  // Fill in the Header information
-  header.m_record = static_cast<SharedAllocationRecord<void, void>*>(this);
-
-  strncpy(header.m_label, label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-  // Set last element zero, in case c_str is too long
-  header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
+  this->base_t::_fill_host_accessible_header_info(header, label);
 
   // Copy to device memory
   Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, HostSpace>(
       RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader));
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
+      m_space(arg_space) {
+
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -238,201 +291,57 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
 namespace Kokkos {
 namespace Impl {
 
-std::string SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                                   void>::get_label() const {
-  SharedAllocationHeader header;
-
-  Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
-                         Kokkos::Experimental::SYCLDeviceUSMSpace>(
-      &header, RecordBase::head(), sizeof(SharedAllocationHeader));
-
-  return std::string(header.m_label);
-}
-
-SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>*
-SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
-    allocate(const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
-             const std::string& arg_label, const size_t arg_alloc_size) {
-  return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size);
-}
-
-void SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
-    deallocate(SharedAllocationRecord<void, void>* arg_rec) {
-  delete static_cast<SharedAllocationRecord*>(arg_rec);
-}
-
 SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
                        void>::~SharedAllocationRecord() {
+  const char* label = nullptr;
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     SharedAllocationHeader header;
     Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                            Kokkos::HostSpace>(&header, RecordBase::m_alloc_ptr,
                                               sizeof(SharedAllocationHeader));
-
-    Kokkos::Profiling::deallocateData(
-        Kokkos::Profiling::make_space_handle(
-            Kokkos::Experimental::SYCLDeviceUSMSpace::name()),
-        header.m_label, data(), size());
+    label = header.label();
   }
-
-  m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
-                     SharedAllocationRecord<void, void>::m_alloc_size);
-}
-
-//----------------------------------------------------------------------------
-
-void* SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
-    allocate_tracked(const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
-                     const std::string& arg_alloc_label,
-                     const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord* const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
+  const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     alloc_size, alloc_size - sizeof(SharedAllocationHeader));
 }
 
-void SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                            void>::deallocate_tracked(void* const
-                                                          arg_alloc_ptr) {
-  if (arg_alloc_ptr != nullptr) {
-    SharedAllocationRecord* const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
+SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace,
+                       void>::~SharedAllocationRecord() {
+  const char* label = nullptr;
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    label = RecordBase::m_alloc_ptr->m_label;
   }
-}
-
-void* SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
-    reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord* const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                         Kokkos::Experimental::SYCLDeviceUSMSpace>(
-      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
+  const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     alloc_size, alloc_size - sizeof(SharedAllocationHeader));
 }
 
 //----------------------------------------------------------------------------
 
-SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>*
-SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                       void>::get_record(void* alloc_ptr) {
-  using Header = SharedAllocationHeader;
-  using RecordSYCL =
-      SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
-
-  // Copy the header from the allocation
-  Header head;
-
-  Header const* const head_sycl =
-      alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;
+}  // namespace Impl
+}  // namespace Kokkos
 
-  if (alloc_ptr) {
-    Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
-                           Kokkos::Experimental::SYCLDeviceUSMSpace>(
-        &head, head_sycl, sizeof(SharedAllocationHeader));
-  }
+//==============================================================================
+// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
 
-  RecordSYCL* const record =
-      alloc_ptr ? static_cast<RecordSYCL*>(head.m_record) : nullptr;
+#include <impl/Kokkos_SharedAlloc_timpl.hpp>
 
-  if (!alloc_ptr || record->m_alloc_ptr != head_sycl) {
-    Kokkos::Impl::throw_runtime_exception(
-        std::string("Kokkos::Impl::SharedAllocationRecord< "
-                    "Kokkos::Experimental::SYCLDeviceUSMSpace "
-                    ", void >::get_record ERROR"));
-  }
-
-  return record;
-}
+namespace Kokkos {
+namespace Impl {
 
-// Iterate records to print orphaned memory ...
-void SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
-    print_records(std::ostream& s,
-                  const Kokkos::Experimental::SYCLDeviceUSMSpace&,
-                  bool detail) {
-#ifdef KOKKOS_ENABLE_DEBUG
-  SharedAllocationRecord<void, void>* r = &s_root_record;
-
-  char buffer[256];
-
-  SharedAllocationHeader head;
-
-  if (detail) {
-    do {
-      if (r->m_alloc_ptr) {
-        Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
-                               Kokkos::Experimental::SYCLDeviceUSMSpace>(
-            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
-      } else {
-        head.m_label[0] = 0;
-      }
-
-      // Formatting dependent on sizeof(uintptr_t)
-      const char* format_string;
-
-      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-        format_string =
-            "SYCL addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx "
-            "+ %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
-      } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-        format_string =
-            "SYCL addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ "
-            "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
-      }
-
-      snprintf(buffer, 256, format_string, reinterpret_cast<uintptr_t>(r),
-               reinterpret_cast<uintptr_t>(r->m_prev),
-               reinterpret_cast<uintptr_t>(r->m_next),
-               reinterpret_cast<uintptr_t>(r->m_alloc_ptr), r->m_alloc_size,
-               r->m_count, reinterpret_cast<uintptr_t>(r->m_dealloc),
-               head.m_label);
-      s << buffer;
-      r = r->m_next;
-    } while (r != &s_root_record);
-  } else {
-    do {
-      if (r->m_alloc_ptr) {
-        Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
-                               Kokkos::Experimental::SYCLDeviceUSMSpace>(
-            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
-
-        // Formatting dependent on sizeof(uintptr_t)
-        const char* format_string;
-
-        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
-          format_string = "SYCL [ 0x%.12lx + %ld ] %s\n";
-        } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
-          format_string = "SYCL [ 0x%.12llx + %ld ] %s\n";
-        }
-
-        snprintf(buffer, 256, format_string,
-                 reinterpret_cast<uintptr_t>(r->data()), r->size(),
-                 head.m_label);
-      } else {
-        snprintf(buffer, 256, "SYCL [ 0 + 0 ]\n");
-      }
-      s << buffer;
-      r = r->m_next;
-    } while (r != &s_root_record);
-  }
-#else
-  (void)s;
-  (void)detail;
-  throw_runtime_exception(
-      "Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>::print_records"
-      " only works with KOKKOS_ENABLE_DEBUG enabled");
-#endif
-}
+// To avoid additional compilation cost for something that's (mostly?) not
+// performance sensitive, we explicity instantiate these CRTP base classes here,
+// where we have access to the associated *_timpl.hpp header files.
+template class HostInaccessibleSharedAllocationRecordCommon<
+    Kokkos::Experimental::SYCLDeviceUSMSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::SYCLDeviceUSMSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::SYCLSharedUSMSpace>;
 
 }  // namespace Impl
 }  // namespace Kokkos
+
+// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
+//==============================================================================
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a30cf2109a60ccc5934bfc6ee834a831c539d485
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -0,0 +1,816 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SYCL_TEAM_HPP
+#define KOKKOS_SYCL_TEAM_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_SYCL
+
+#include <utility>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/**\brief  Team member_type passed to TeamPolicy or TeamTask closures.
+ */
+class SYCLTeamMember {
+ public:
+  using execution_space      = Kokkos::Experimental::SYCL;
+  using scratch_memory_space = execution_space::scratch_memory_space;
+
+ private:
+  mutable void* m_team_reduce;
+  scratch_memory_space m_team_shared;
+  int m_team_reduce_size;
+  sycl::nd_item<2> m_item;
+
+ public:
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_shmem() const {
+    return m_team_shared.set_team_thread_mode(0, 1, 0);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_scratch(
+      const int level) const {
+    return m_team_shared.set_team_thread_mode(level, 1, 0);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& thread_scratch(
+      const int level) const {
+    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
+  }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const {
+    return m_item.get_group_linear_id();
+  }
+  KOKKOS_INLINE_FUNCTION int league_size() const {
+    // FIXME_SYCL needs to be revised for vector_length>1.
+    return m_item.get_group_range(0);
+  }
+  KOKKOS_INLINE_FUNCTION int team_rank() const {
+    return m_item.get_local_linear_id();
+  }
+  KOKKOS_INLINE_FUNCTION int team_size() const {
+    // FIXME_SYCL needs to be revised for vector_length>1.
+    return m_item.get_local_range(0);
+  }
+  KOKKOS_INLINE_FUNCTION void team_barrier() const { m_item.barrier(); }
+
+  KOKKOS_INLINE_FUNCTION const sycl::nd_item<2>& item() const { return m_item; }
+
+  //--------------------------------------------------------------------------
+
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& val,
+                                             const int thread_id) const {
+    // Wait for shared data write until all threads arrive here
+    m_item.barrier(sycl::access::fence_space::local_space);
+    if (m_item.get_local_id(1) == 0 &&
+        static_cast<int>(m_item.get_local_id(0)) == thread_id) {
+      *static_cast<ValueType*>(m_team_reduce) = val;
+    }
+    // Wait for shared data read until root thread writes
+    m_item.barrier(sycl::access::fence_space::local_space);
+    val = *static_cast<ValueType*>(m_team_reduce);
+  }
+
+  template <class Closure, class ValueType>
+  KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, ValueType& val,
+                                             const int thread_id) const {
+    f(val);
+    team_broadcast(val, thread_id);
+  }
+
+  //--------------------------------------------------------------------------
+  /**\brief  Reduction across a team
+   */
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION
+      typename std::enable_if<is_reducer<ReducerType>::value>::type
+      team_reduce(ReducerType const& reducer) const noexcept {
+    team_reduce(reducer, reducer.reference());
+  }
+
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION
+      typename std::enable_if<is_reducer<ReducerType>::value>::type
+      team_reduce(ReducerType const& reducer,
+                  typename ReducerType::value_type& value) const noexcept {
+    using value_type = typename ReducerType::value_type;
+
+    // We need to chunk up the whole reduction because we might not have
+    // allocated enough memory.
+    const int maximum_work_range =
+        std::min<int>(m_team_reduce_size / sizeof(value_type), team_size());
+
+    int smaller_power_of_two = 1;
+    while ((smaller_power_of_two << 1) < maximum_work_range)
+      smaller_power_of_two <<= 1;
+
+    const int idx        = team_rank();
+    auto reduction_array = static_cast<value_type*>(m_team_reduce);
+
+    // Load values into the first maximum_work_range values of the reduction
+    // array in chunks. This means that only threads with an id in the
+    // corresponding chunk load values and the reduction is always done by the
+    // first smaller_power_of_two threads.
+    if (idx < maximum_work_range) reduction_array[idx] = value;
+    m_item.barrier(sycl::access::fence_space::local_space);
+
+    for (int start = maximum_work_range; start < team_size();
+         start += maximum_work_range) {
+      if (idx >= start &&
+          idx < std::min(start + maximum_work_range, team_size()))
+        reducer.join(reduction_array[idx - start], value);
+      m_item.barrier(sycl::access::fence_space::local_space);
+    }
+
+    for (int stride = smaller_power_of_two; stride > 0; stride >>= 1) {
+      if (idx < stride && idx + stride < maximum_work_range)
+        reducer.join(reduction_array[idx], reduction_array[idx + stride]);
+      m_item.barrier(sycl::access::fence_space::local_space);
+    }
+    reducer.reference() = reduction_array[0];
+    m_item.barrier(sycl::access::fence_space::local_space);
+  }
+
+  // FIXME_SYCL move somewhere else and combine with other places that do
+  // parallel_scan
+  // Exclusive scan returning the total sum.
+  // n is required to be a power of two and
+  // temp must point to an array containing the data to be processed
+  // The accumulated value is returned.
+  template <typename Type>
+  static Type prescan(sycl::nd_item<2> m_item, Type* temp, int n) {
+    int thid = m_item.get_local_id(0);
+
+    // First do a reduction saving intermediate results
+    for (int stride = 1; stride < n; stride <<= 1) {
+      auto idx = 2 * stride * (thid + 1) - 1;
+      if (idx < n) temp[idx] += temp[idx - stride];
+      m_item.barrier(sycl::access::fence_space::local_space);
+    }
+
+    Type total_sum = temp[n - 1];
+    m_item.barrier(sycl::access::fence_space::local_space);
+
+    // clear the last element so we get an exclusive scan
+    if (thid == 0) temp[n - 1] = Type{};
+    m_item.barrier(sycl::access::fence_space::local_space);
+
+    // Now add the intermediate results to the remaining items again
+    for (int stride = n / 2; stride > 0; stride >>= 1) {
+      auto idx = 2 * stride * (thid + 1) - 1;
+      if (idx < n) {
+        Type dummy         = temp[idx - stride];
+        temp[idx - stride] = temp[idx];
+        temp[idx] += dummy;
+      }
+      m_item.barrier(sycl::access::fence_space::local_space);
+    }
+
+    return total_sum;
+  }
+
+  //--------------------------------------------------------------------------
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template <typename Type>
+  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value,
+                                        Type* const global_accum) const {
+    // We need to chunk up the whole reduction because we might not have
+    // allocated enough memory.
+    const int maximum_work_range =
+        std::min<int>(m_team_reduce_size / sizeof(Type), team_size());
+
+    int not_greater_power_of_two = 1;
+    while ((not_greater_power_of_two << 1) < maximum_work_range + 1)
+      not_greater_power_of_two <<= 1;
+
+    Type intermediate;
+    Type total{};
+
+    const int idx        = team_rank();
+    const auto base_data = static_cast<Type*>(m_team_reduce);
+
+    // Load values into the first not_greater_power_of_two values of the
+    // reduction array in chunks. This means that only threads with an id in the
+    // corresponding chunk load values and the reduction is always done by the
+    // first not_greater_power_of_two threads.
+    for (int start = 0; start < team_size();
+         start += not_greater_power_of_two) {
+      m_item.barrier(sycl::access::fence_space::local_space);
+      if (idx >= start && idx < start + not_greater_power_of_two) {
+        base_data[idx - start] = value;
+      }
+      m_item.barrier(sycl::access::fence_space::local_space);
+
+      const Type partial_total =
+          prescan(m_item, base_data, not_greater_power_of_two);
+      if (idx >= start && idx < start + not_greater_power_of_two)
+        intermediate = base_data[idx - start] + total;
+      if (start == 0)
+        total = partial_total;
+      else
+        total += partial_total;
+    }
+
+    if (global_accum) {
+      if (team_size() == idx + 1) {
+        base_data[team_size()] = atomic_fetch_add(global_accum, total);
+      }
+      m_item.barrier();  // Wait for atomic
+      intermediate += base_data[team_size()];
+    }
+
+    return intermediate;
+  }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template <typename Type>
+  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
+    return this->template team_scan<Type>(value, nullptr);
+  }
+
+  //----------------------------------------
+
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION static
+      typename std::enable_if<is_reducer<ReducerType>::value>::type
+      vector_reduce(ReducerType const& reducer) {
+    vector_reduce(reducer, reducer.reference());
+  }
+
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION static
+      typename std::enable_if<is_reducer<ReducerType>::value>::type
+      vector_reduce(ReducerType const& /*reducer*/,
+                    typename ReducerType::value_type& /*value*/) {
+    // FIXME_SYCL
+    Kokkos::abort("Not implemented!");
+  }
+
+  //--------------------------------------------------------------------------
+  /**\brief  Global reduction across all blocks
+   *
+   *  Return !0 if reducer contains the final value
+   */
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION static
+      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
+      global_reduce(ReducerType const& /*reducer*/,
+                    int* const /*global_scratch_flags*/,
+                    void* const /*global_scratch_space*/, void* const /*shmem*/,
+                    int const /*shmem_size*/) {
+    // FIXME_SYCL
+    Kokkos::abort("Not implemented!");
+  }
+
+  //----------------------------------------
+  // Private for the driver
+
+  KOKKOS_INLINE_FUNCTION
+  SYCLTeamMember(void* shared, const int shared_begin, const int shared_size,
+                 void* scratch_level_1_ptr, const int scratch_level_1_size,
+                 const sycl::nd_item<2> item)
+      : m_team_reduce(shared),
+        m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
+                      scratch_level_1_ptr, scratch_level_1_size),
+        m_team_reduce_size(shared_begin),
+        m_item(item) {}
+
+ public:
+  // Declare to avoid unused private member warnings which are trigger
+  // when SFINAE excludes the member function which uses these variables
+  // Making another class a friend also surpresses these warnings
+  bool impl_avoid_sfinae_warning() const noexcept {
+    return m_team_reduce_size > 0 && m_team_reduce != nullptr;
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template <typename iType>
+struct TeamThreadRangeBoundariesStruct<iType, SYCLTeamMember> {
+  using index_type = iType;
+  const SYCLTeamMember& member;
+  const iType start;
+  const iType end;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct(const SYCLTeamMember& thread_, iType count)
+      : member(thread_), start(0), end(count) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct(const SYCLTeamMember& thread_, iType begin_,
+                                  iType end_)
+      : member(thread_), start(begin_), end(end_) {}
+};
+
+template <typename iType>
+struct TeamVectorRangeBoundariesStruct<iType, SYCLTeamMember> {
+  using index_type = iType;
+  const SYCLTeamMember& member;
+  const iType start;
+  const iType end;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamVectorRangeBoundariesStruct(const SYCLTeamMember& thread_,
+                                  const iType& count)
+      : member(thread_), start(0), end(count) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamVectorRangeBoundariesStruct(const SYCLTeamMember& thread_,
+                                  const iType& begin_, const iType& end_)
+      : member(thread_), start(begin_), end(end_) {}
+};
+
+template <typename iType>
+struct ThreadVectorRangeBoundariesStruct<iType, SYCLTeamMember> {
+  using index_type = iType;
+  const SYCLTeamMember& member;
+  const index_type start;
+  const index_type end;
+
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct(const SYCLTeamMember& thread,
+                                    index_type count)
+      : member(thread), start(static_cast<index_type>(0)), end(count) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct(const SYCLTeamMember& thread,
+                                    index_type arg_begin, index_type arg_end)
+      : member(thread), start(arg_begin), end(arg_end) {}
+};
+
+}  // namespace Impl
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION
+    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>
+    TeamThreadRange(const Impl::SYCLTeamMember& thread, iType count) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
+      thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
+TeamThreadRange(const Impl::SYCLTeamMember& thread, iType1 begin, iType2 end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
+      thread, iType(begin), iType(end));
+}
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION
+    Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>
+    TeamVectorRange(const Impl::SYCLTeamMember& thread, const iType& count) {
+  return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
+      thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
+TeamVectorRange(const Impl::SYCLTeamMember& thread, const iType1& begin,
+                const iType2& end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
+  return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
+      thread, iType(begin), iType(end));
+}
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION
+    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>
+    ThreadVectorRange(const Impl::SYCLTeamMember& thread, iType count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
+      thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
+ThreadVectorRange(const Impl::SYCLTeamMember& thread, iType1 arg_begin,
+                  iType2 arg_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
+  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
+      thread, iType(arg_begin), iType(arg_end));
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::SYCLTeamMember> PerTeam(
+    const Impl::SYCLTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::SYCLTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::SYCLTeamMember> PerThread(
+    const Impl::SYCLTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::SYCLTeamMember>(thread);
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Inter-thread parallel_for.
+ *
+ *  Executes closure(iType i) for each i=[0..N).
+ *
+ * The range [0..N) is mapped to all threads of the calling thread team.
+ */
+template <typename iType, class Closure>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
+        loop_boundaries,
+    const Closure& closure) {
+  // FIXME_SYCL Fix for vector_length>1.
+  for (iType i = loop_boundaries.start +
+                 loop_boundaries.member.item().get_local_id(0);
+       i < loop_boundaries.end;
+       i += loop_boundaries.member.item().get_local_range(0))
+    closure(i);
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Inter-thread parallel_reduce with a reducer.
+ *
+ *  Executes closure(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all threads of the
+ *  calling thread team and a summation of val is
+ *  performed and put into result.
+ */
+template <typename iType, class Closure, class ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                        iType, Impl::SYCLTeamMember>& loop_boundaries,
+                    const Closure& closure, const ReducerType& reducer) {
+  typename ReducerType::value_type value;
+  reducer.init(value);
+
+  // FIXME_SYCL Fix for vector_length>1.
+  for (iType i = loop_boundaries.start +
+                 loop_boundaries.member.item().get_local_id(0);
+       i < loop_boundaries.end;
+       i += loop_boundaries.member.item().get_local_range(0)) {
+    closure(i, value);
+  }
+
+  loop_boundaries.member.team_reduce(reducer, value);
+}
+
+/** \brief  Inter-thread parallel_reduce assuming summation.
+ *
+ *  Executes closure(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all threads of the
+ *  calling thread team and a summation of val is
+ *  performed and put into result.
+ */
+template <typename iType, class Closure, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
+    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                        iType, Impl::SYCLTeamMember>& loop_boundaries,
+                    const Closure& closure, ValueType& result) {
+  ValueType val;
+  Kokkos::Sum<ValueType> reducer(val);
+
+  reducer.init(reducer.reference());
+
+  // FIXME_SYCL Fix for vector_length>1.
+  for (iType i = loop_boundaries.start +
+                 loop_boundaries.member.item().get_local_id(0);
+       i < loop_boundaries.end;
+       i += loop_boundaries.member.item().get_local_range(0)) {
+    closure(i, val);
+  }
+
+  loop_boundaries.member.team_reduce(reducer, val);
+  result = reducer.reference();
+}
+
+/** \brief  Inter-thread parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to each rank in the team (whose global rank is
+ *  less than N) and a scan operation is performed. The last call to closure has
+ *  final == true.
+ */
+// This is the same code as in CUDA and largely the same as in OpenMPTarget
+template <typename iType, typename FunctorType>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
+        loop_bounds,
+    const FunctorType& lambda) {
+  // Extract value_type from lambda
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
+      FunctorType>::value_type;
+
+  const auto start     = loop_bounds.start;
+  const auto end       = loop_bounds.end;
+  auto& member         = loop_bounds.member;
+  const auto team_size = member.team_size();
+  const auto team_rank = member.team_rank();
+  const auto nchunk    = (end - start + team_size - 1) / team_size;
+  value_type accum     = 0;
+  // each team has to process one or more chunks of the prefix scan
+  for (iType i = 0; i < nchunk; ++i) {
+    auto ii = start + i * team_size + team_rank;
+    // local accumulation for this chunk
+    value_type local_accum = 0;
+    // user updates value with prefix value
+    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
+    // perform team scan
+    local_accum = member.team_scan(local_accum);
+    // add this blocks accum to total accumulation
+    auto val = accum + local_accum;
+    // user updates their data with total accumulation
+    if (ii < loop_bounds.end) lambda(ii, val, true);
+    // the last value needs to be propogated to next chunk
+    if (team_rank == team_size - 1) accum = val;
+    // broadcast last value to rest of the team
+    member.team_broadcast(accum, team_size - 1);
+  }
+}
+
+template <typename iType, class Closure>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
+        loop_boundaries,
+    const Closure& closure) {
+  // FIXME_SYCL adapt for vector_length != 1
+  for (iType i = loop_boundaries.start +
+                 loop_boundaries.member.item().get_local_id(0);
+       i < loop_boundaries.end;
+       i += loop_boundaries.member.item().get_local_range(0))
+    closure(i);
+}
+
+template <typename iType, class Closure, class ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                        iType, Impl::SYCLTeamMember>& loop_boundaries,
+                    const Closure& closure, const ReducerType& reducer) {
+  // FIXME_SYCL adapt for vector_length != 1
+  typename ReducerType::value_type value;
+  reducer.init(value);
+
+  for (iType i = loop_boundaries.start +
+                 loop_boundaries.member.item().get_local_id(0);
+       i < loop_boundaries.end;
+       i += loop_boundaries.member.item().get_local_range(0)) {
+    closure(i, value);
+  }
+
+  loop_boundaries.member.team_reduce(reducer, value);
+}
+
+template <typename iType, class Closure, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
+    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                        iType, Impl::SYCLTeamMember>& loop_boundaries,
+                    const Closure& closure, ValueType& result) {
+  // FIXME_SYCL adapt for vector_length != 1
+  ValueType val;
+  Kokkos::Sum<ValueType> reducer(val);
+
+  reducer.init(reducer.reference());
+
+  for (iType i = loop_boundaries.start +
+                 loop_boundaries.member.item().get_local_id(0);
+       i < loop_boundaries.end;
+       i += loop_boundaries.member.item().get_local_range(0)) {
+    closure(i, val);
+  }
+
+  loop_boundaries.member.team_reduce(reducer, val);
+  result = reducer.reference();
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel_for.
+ *
+ *  Executes closure(iType i) for each i=[0..N)
+ *
+ * The range [0..N) is mapped to all vector lanes of the calling thread.
+ */
+template <typename iType, class Closure>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
+        loop_boundaries,
+    const Closure& closure) {
+  // FIXME_SYC: adapt for vector_length!=1
+  for (auto i = loop_boundaries.start; i != loop_boundaries.end; ++i)
+    closure(i);
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Calls closure(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed using +=
+ *  and output into result.
+ *
+ *  The identity value for the += operator is assumed to be the default
+ *  constructed value.
+ */
+template <typename iType, class Closure, class ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<is_reducer<ReducerType>::value>::type
+    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                        iType, Impl::SYCLTeamMember> const& loop_boundaries,
+                    Closure const& closure, ReducerType const& reducer) {
+  // FIXME_SYCL adapt for vector_length != 1
+  reducer.init(reducer.reference());
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+    closure(i, reducer.reference());
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Calls closure(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed using +=
+ *  and output into result.
+ *
+ *  The identity value for the += operator is assumed to be the default
+ *  constructed value.
+ */
+template <typename iType, class Closure, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<!is_reducer<ValueType>::value>::type
+    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                        iType, Impl::SYCLTeamMember> const& loop_boundaries,
+                    Closure const& closure, ValueType& result) {
+  // FIXME_SYCL adapt for vector_length != 1
+  result = ValueType();
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+    closure(i, result);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum with reducer.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template <typename iType, class Closure, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                      iType, Impl::SYCLTeamMember>& loop_boundaries,
+                  const Closure& closure, const ReducerType& reducer) {
+  // FIXME_SYCL modify for vector_length!=1
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+
+  value_type accum;
+  reducer.init(accum);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+    closure(i, accum, true);
+  }
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template <typename iType, class Closure>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
+        loop_boundaries,
+    const Closure& closure) {
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+  value_type dummy;
+  parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>{dummy});
+}
+
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::SYCLTeamMember>& single_struct,
+    const FunctorType& lambda) {
+  if (single_struct.team_member.item().get_local_id(1) == 0) lambda();
+}
+
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
+    const FunctorType& lambda) {
+  if (single_struct.team_member.team_rank() == 0) lambda();
+}
+
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::SYCLTeamMember>& single_struct,
+    const FunctorType& lambda, ValueType& val) {
+  if (single_struct.team_member.item().get_local_id(1) == 0) lambda(val);
+}
+
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
+    const FunctorType& lambda, ValueType& val) {
+  if (single_struct.team_member.team_rank() == 0) lambda(val);
+}
+
+}  // namespace Kokkos
+
+#endif
+
+#endif /* #ifndef KOKKOS_SYCL_TEAM_HPP */
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..141a692f6090555cf129997a64bc9e99941f830d
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
@@ -0,0 +1,134 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SYCL_UNIQUE_TOKEN_HPP
+#define KOKKOS_SYCL_UNIQUE_TOKEN_HPP
+
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+#include <Kokkos_SYCL_Space.hpp>
+#include <Kokkos_UniqueToken.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+// both global and instance Unique Tokens are implemented in the same way
+template <>
+class UniqueToken<SYCL, UniqueTokenScope::Global> {
+ protected:
+  uint32_t volatile* m_buffer;
+  uint32_t m_count;
+
+ public:
+  using execution_space = SYCL;
+  using size_type       = int32_t;
+
+  explicit UniqueToken(execution_space const& = execution_space())
+      : m_buffer(Impl::SYCLInternal::singleton().m_scratchConcurrentBitset),
+        m_count(SYCL::concurrency()) {}
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken(const UniqueToken&) = default;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken(UniqueToken&&) = default;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken& operator=(const UniqueToken&) = default;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  UniqueToken& operator=(UniqueToken&&) = default;
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  size_type size() const noexcept { return m_count; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  size_type acquire() const {
+    const Kokkos::pair<int, int> result =
+        Kokkos::Impl::concurrent_bitset::acquire_bounded(
+            m_buffer, m_count
+#if defined(KOKKOS_ARCH_INTEL_GEN)
+            ,
+            Kokkos::Impl::clock_tic() % m_count
+#endif
+        );
+
+    if (result.first < 0) {
+      Kokkos::abort(
+          "UniqueToken<SYCL> failure to acquire tokens, no tokens available");
+    }
+
+    return result.first;
+  }
+
+  /// \brief release an acquired value
+  KOKKOS_INLINE_FUNCTION
+  void release(size_type i) const noexcept {
+    Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
+  }
+};
+
+template <>
+class UniqueToken<SYCL, UniqueTokenScope::Instance>
+    : public UniqueToken<SYCL, UniqueTokenScope::Global> {
+  View<uint32_t*, SYCLDeviceUSMSpace> m_buffer_view;
+
+ public:
+  explicit UniqueToken(execution_space const& arg = execution_space())
+      : UniqueToken<SYCL, UniqueTokenScope::Global>(arg) {}
+
+  UniqueToken(size_type max_size, execution_space const& = execution_space())
+      : m_buffer_view(
+            "UniqueToken::m_buffer_view",
+            ::Kokkos::Impl::concurrent_bitset::buffer_bound(max_size)) {
+    m_buffer = m_buffer_view.data();
+    m_count  = max_size;
+  }
+};
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 443ed7b721932fd14ddbd0f3a116d86b62ab4c66..92bd671bd53bf89482aee39cdd34b3391e9a01a2 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -111,7 +111,7 @@ namespace Impl {
 
 void execute_function_noop(ThreadsExec &, const void *) {}
 
-void ThreadsExec::driver(void) {
+void ThreadsExec::driver() {
   SharedAllocationRecord<void, void>::tracking_enable();
 
   ThreadsExec this_thread;
@@ -427,7 +427,7 @@ void ThreadsExec::execute_resize_scratch(ThreadsExec &exec, const void *) {
     // Allocate tracked memory:
     {
       Record *const r =
-          Record::allocate(Kokkos::HostSpace(), "thread_scratch",
+          Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch",
                            s_threads_process.m_scratch_thread_end);
 
       Record::increment(r);
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
index fb7736b478197ca79ad2548ab8c195f121d1a4ce..e0ae43dd87ec337d24f659e3da74a662f31dfb84 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -891,14 +891,16 @@ KOKKOS_INLINE_FUNCTION
       thread, count);
 }
 
-template <typename iType>
-KOKKOS_INLINE_FUNCTION
-    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::ThreadsExecTeamMember>
-    ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread,
-                      const iType& arg_begin, const iType& arg_end) {
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type,
+    Impl::ThreadsExecTeamMember>
+ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread,
+                  const iType1& arg_begin, const iType2& arg_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::ThreadVectorRangeBoundariesStruct<iType,
                                                  Impl::ThreadsExecTeamMember>(
-      thread, arg_begin, arg_end);
+      thread, iType(arg_begin), iType(arg_end));
 }
 
 KOKKOS_INLINE_FUNCTION
@@ -1095,6 +1097,27 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
   }
 }
 
+/** \brief  Intra-thread vector parallel scan with reducer
+ *
+ */
+template <typename iType, class FunctorType, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                      iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                  const FunctorType& lambda, const ReducerType& reducer) {
+  typename ReducerType::value_type scan_val;
+  reducer.init(scan_val);
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, true);
+  }
+}
+
 }  // namespace Kokkos
 
 namespace Kokkos {
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
index eac96998650c707dc8c6ab9a905576e6c781ee88..df09e9e7215310e26d72009cc32f7e5339dfdc5b 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
@@ -47,6 +47,15 @@
 
 #if defined(KOKKOS_ENABLE_CUDA)
 #include <Kokkos_Cuda.hpp>
+#include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
+#include <Cuda/Kokkos_Cuda_Instance.hpp>
+#include <Cuda/Kokkos_Cuda_View.hpp>
+#include <Cuda/Kokkos_Cuda_Team.hpp>
+#include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_Task.hpp>
+#include <Cuda/Kokkos_Cuda_MDRangePolicy.hpp>
+#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
index 769585fc7392300cc01a8d29f494236ed892fd9d..b193d1e741bc19d1725994839c682fa84f2267f9 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
@@ -48,6 +48,7 @@
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
 #include <Kokkos_OpenMPTarget.hpp>
 #include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp
index 4981435c829881d7aa2e0d0c1ae7165f4e689b0b..92cd85bcae8b9e8c65d37b9308033a0748c8d3aa 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp
@@ -48,9 +48,12 @@
 #if defined(KOKKOS_ENABLE_SYCL)
 #include <Kokkos_SYCL.hpp>
 #include <SYCL/Kokkos_SYCL_DeepCopy.hpp>
+#include <SYCL/Kokkos_SYCL_MDRangePolicy.hpp>
 #include <SYCL/Kokkos_SYCL_Parallel_Range.hpp>
 #include <SYCL/Kokkos_SYCL_Parallel_Reduce.hpp>
 #include <SYCL/Kokkos_SYCL_Parallel_Scan.hpp>
+#include <SYCL/Kokkos_SYCL_Parallel_Team.hpp>
+#include <SYCL/Kokkos_SYCL_UniqueToken.hpp>
 #endif
 
 #endif
diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp
index 0465c380cbac1be27ee9769ab7f93f87fb315aa5..1a4e7b482c44b93f87ed981682e3895cf5a534ff 100644
--- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp
+++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp
@@ -48,8 +48,9 @@
 #if defined(KOKKOS_ENABLE_HIP)
 namespace Kokkos {
 namespace Experimental {
-class HIPSpace;  ///< Memory space on HIP GPU
-class HIP;       ///< Execution space for HIP GPU
+class HIPSpace;            ///< Memory space on HIP GPU
+class HIPHostPinnedSpace;  ///< Memory space on Host accessible to HIP GPU
+class HIP;                 ///< Execution space for HIP GPU
 }  // namespace Experimental
 }  // namespace Kokkos
 #endif
diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
index fc21b942c29527e8191151c96c547551291409e1..7754daa8a0189a3d0708ce6505955be4b76b2d61 100644
--- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
+++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
@@ -48,7 +48,10 @@
 #if defined(KOKKOS_ENABLE_SYCL)
 namespace Kokkos {
 namespace Experimental {
-class SYCLDeviceUSMSpace;  ///< Memory space on SYCL device
+class SYCLDeviceUSMSpace;  ///< Memory space on SYCL device, not accessible from
+                           ///< the host
+class SYCLSharedUSMSpace;  ///< Memory space accessible from both the SYCL
+                           ///< device and the host
 class SYCL;                ///< Execution space for SYCL
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
index d9f02b47acaac5a0611878c180bcb93bcc0d57c6..7f72b3983f57c9adea157cf70d815339696cd986 100644
--- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
+++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -1582,7 +1582,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1600,7 +1600,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    Tile_Loop_Type<RP::rank, (RP::inner_direction == RP::Left), index_type,
+    Tile_Loop_Type<RP::rank, (RP::inner_direction == Iterate::Left), index_type,
                    Tag>::apply(m_func, full_tile, m_offset, m_rp.m_tile,
                                m_tiledims);
   }
@@ -1618,7 +1618,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1636,7 +1636,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
@@ -1644,7 +1644,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -1653,7 +1653,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 2
 
@@ -1662,7 +1662,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1680,7 +1680,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
@@ -1688,7 +1688,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -1697,7 +1697,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 3
 
@@ -1706,7 +1706,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1724,7 +1724,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
@@ -1732,7 +1732,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -1741,7 +1741,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 4
 
@@ -1750,7 +1750,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1768,7 +1768,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
@@ -1776,7 +1776,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -1785,7 +1785,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 5
 
@@ -1794,7 +1794,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1812,7 +1812,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
@@ -1820,7 +1820,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -1829,7 +1829,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 6
 
@@ -1838,7 +1838,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1856,7 +1856,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
@@ -1864,7 +1864,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -1873,7 +1873,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 7
 
@@ -1882,7 +1882,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -1900,7 +1900,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
@@ -1908,7 +1908,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -1917,7 +1917,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 8
 #endif
@@ -2003,7 +2003,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2021,7 +2021,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    Tile_Loop_Type<RP::rank, (RP::inner_direction == RP::Left), index_type,
+    Tile_Loop_Type<RP::rank, (RP::inner_direction == Iterate::Left), index_type,
                    Tag>::apply(m_v, m_func, full_tile, m_offset, m_rp.m_tile,
                                m_tiledims);
   }
@@ -2039,7 +2039,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2057,7 +2057,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
@@ -2065,7 +2065,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2074,7 +2074,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 2
 
@@ -2083,7 +2083,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2101,7 +2101,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
@@ -2109,7 +2109,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2118,7 +2118,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 3
 
@@ -2127,7 +2127,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2145,7 +2145,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
@@ -2153,7 +2153,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2162,7 +2162,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 4
 
@@ -2171,7 +2171,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2189,7 +2189,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
@@ -2197,7 +2197,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2206,7 +2206,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 5
 
@@ -2215,7 +2215,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2233,7 +2233,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
@@ -2241,7 +2241,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2250,7 +2250,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 6
 
@@ -2259,7 +2259,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2277,7 +2277,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
@@ -2285,7 +2285,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2294,7 +2294,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 7
 
@@ -2303,7 +2303,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2321,7 +2321,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
@@ -2329,7 +2329,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2338,7 +2338,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 8
 #endif
@@ -2426,7 +2426,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2444,7 +2444,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    Tile_Loop_Type<RP::rank, (RP::inner_direction == RP::Left), index_type,
+    Tile_Loop_Type<RP::rank, (RP::inner_direction == Iterate::Left), index_type,
                    Tag>::apply(m_v, m_func, full_tile, m_offset, m_rp.m_tile,
                                m_tiledims);
   }
@@ -2462,7 +2462,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2480,7 +2480,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
@@ -2488,7 +2488,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2497,7 +2497,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 2
 
@@ -2506,7 +2506,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2524,7 +2524,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
@@ -2532,7 +2532,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2541,7 +2541,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 3
 
@@ -2550,7 +2550,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2568,7 +2568,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
@@ -2576,7 +2576,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2585,7 +2585,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 4
 
@@ -2594,7 +2594,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2612,7 +2612,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
@@ -2620,7 +2620,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2629,7 +2629,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 5
 
@@ -2638,7 +2638,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2656,7 +2656,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
@@ -2664,7 +2664,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2673,7 +2673,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 6
 
@@ -2682,7 +2682,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2700,7 +2700,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
@@ -2708,7 +2708,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2717,7 +2717,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 7
 
@@ -2726,7 +2726,7 @@ struct HostIterateTile<
     point_type m_offset;
     point_type m_tiledims;
 
-    if (RP::outer_direction == RP::Left) {
+    if (RP::outer_direction == Iterate::Left) {
       for (int i = 0; i < RP::rank; ++i) {
         m_offset[i] =
             (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i];
@@ -2744,7 +2744,7 @@ struct HostIterateTile<
     // partial tile dims
     const bool full_tile = check_iteration_bounds(m_tiledims, m_offset);
 
-    if (RP::inner_direction == RP::Left) {
+    if (RP::inner_direction == Iterate::Left) {
       if (full_tile) {
         //      #pragma simd
         LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
@@ -2752,7 +2752,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
-    }  // end RP::Left
+    }  // end Iterate::Left
     else {
       if (full_tile) {
         //      #pragma simd
@@ -2761,7 +2761,7 @@ struct HostIterateTile<
         //      #pragma simd
         LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); }
       }
-    }  // end RP::Right
+    }  // end Iterate::Right
 
   }  // end op() rank == 8
 #endif
diff --git a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp
index 45e53d3a4bc9fe4c439ac9793f8185fa1aa8258c..688afcc107e4e4ff93a2b415c8209d29bf4c0ba2 100644
--- a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp
+++ b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp
@@ -57,90 +57,73 @@
 namespace Kokkos {
 namespace Impl {
 
+#ifdef KOKKOS_ENABLE_SYCL
+template <typename index_type>
+struct EmulateCUDADim3 {
+  index_type x;
+  index_type y;
+  index_type z;
+};
+#endif
+
+template <class Tag, class Functor, class... Args>
+KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t<std::is_void<Tag>::value>
+_tag_invoke(Functor const& f, Args&&... args) {
+  f((Args &&) args...);
+}
+
+template <class Tag, class Functor, class... Args>
+KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t<!std::is_void<Tag>::value>
+_tag_invoke(Functor const& f, Args&&... args) {
+  f(Tag{}, (Args &&) args...);
+}
+
+template <class Tag, class Functor, class T, size_t N, size_t... Idxs,
+          class... Args>
+KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array_helper(
+    Functor const& f, T (&vals)[N], std::integer_sequence<size_t, Idxs...>,
+    Args&&... args) {
+  _tag_invoke<Tag>(f, vals[Idxs]..., (Args &&) args...);
+}
+
+template <class Tag, class Functor, class T, size_t N, class... Args>
+KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f,
+                                                        T (&vals)[N],
+                                                        Args&&... args) {
+  _tag_invoke_array_helper<Tag>(f, vals, std::make_index_sequence<N>{},
+                                (Args &&) args...);
+}
+
 // ------------------------------------------------------------------ //
 // ParallelFor iteration pattern
 template <int N, typename PolicyType, typename Functor, typename Tag>
 struct DeviceIterateTile;
 
 // Rank 2
-// Specializations for void tag type
-template <typename PolicyType, typename Functor>
-struct DeviceIterateTile<2, PolicyType, Functor, void> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_)
-      : m_policy(policy_), m_func(f_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    // LL
-    if (PolicyType::inner_direction == PolicyType::Left) {
-      for (index_type tile_id1 = static_cast<index_type>(blockIdx.y);
-           tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) {
-        const index_type offset_1 =
-            tile_id1 * m_policy.m_tile[1] +
-            static_cast<index_type>(threadIdx.y) +
-            static_cast<index_type>(m_policy.m_lower[1]);
-        if (offset_1 < m_policy.m_upper[1] &&
-            static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) {
-          for (index_type tile_id0 = static_cast<index_type>(blockIdx.x);
-               tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) {
-            const index_type offset_0 =
-                tile_id0 * m_policy.m_tile[0] +
-                static_cast<index_type>(threadIdx.x) +
-                static_cast<index_type>(m_policy.m_lower[0]);
-            if (offset_0 < m_policy.m_upper[0] &&
-                static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) {
-              m_func(offset_0, offset_1);
-            }
-          }
-        }
-      }
-    }
-    // LR
-    else {
-      for (index_type tile_id0 = static_cast<index_type>(blockIdx.x);
-           tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) {
-        const index_type offset_0 =
-            tile_id0 * m_policy.m_tile[0] +
-            static_cast<index_type>(threadIdx.x) +
-            static_cast<index_type>(m_policy.m_lower[0]);
-        if (offset_0 < m_policy.m_upper[0] &&
-            static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) {
-          for (index_type tile_id1 = static_cast<index_type>(blockIdx.y);
-               tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) {
-            const index_type offset_1 =
-                tile_id1 * m_policy.m_tile[1] +
-                static_cast<index_type>(threadIdx.y) +
-                static_cast<index_type>(m_policy.m_lower[1]);
-            if (offset_1 < m_policy.m_upper[1] &&
-                static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) {
-              m_func(offset_0, offset_1);
-            }
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-};
-
-// Specializations for tag type
 template <typename PolicyType, typename Functor, typename Tag>
 struct DeviceIterateTile<2, PolicyType, Functor, Tag> {
   using index_type = typename PolicyType::index_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(
+      const PolicyType& policy_, const Functor& f_,
+      const EmulateCUDADim3<index_type> gridDim_,
+      const EmulateCUDADim3<index_type> blockIdx_,
+      const EmulateCUDADim3<index_type> threadIdx_)
+      : m_policy(policy_),
+        m_func(f_),
+        gridDim(gridDim_),
+        blockIdx(blockIdx_),
+        threadIdx(threadIdx_) {}
+#else
   KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
                                                 const Functor& f_)
       : m_policy(policy_), m_func(f_) {}
+#endif
 
   KOKKOS_IMPL_DEVICE_FUNCTION
   void exec_range() const {
-    if (PolicyType::inner_direction == PolicyType::Left) {
+    if (PolicyType::inner_direction == Iterate::Left) {
       // Loop over size maxnumblocks until full range covered
       for (index_type tile_id1 = static_cast<index_type>(blockIdx.y);
            tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) {
@@ -158,7 +141,7 @@ struct DeviceIterateTile<2, PolicyType, Functor, Tag> {
                 static_cast<index_type>(m_policy.m_lower[0]);
             if (offset_0 < m_policy.m_upper[0] &&
                 static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) {
-              m_func(Tag(), offset_0, offset_1);
+              Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1);
             }
           }
         }
@@ -180,7 +163,7 @@ struct DeviceIterateTile<2, PolicyType, Functor, Tag> {
                 static_cast<index_type>(m_policy.m_lower[1]);
             if (offset_1 < m_policy.m_upper[1] &&
                 static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) {
-              m_func(Tag(), offset_0, offset_1);
+              Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1);
             }
           }
         }
@@ -191,107 +174,38 @@ struct DeviceIterateTile<2, PolicyType, Functor, Tag> {
  private:
   const PolicyType& m_policy;
   const Functor& m_func;
+#ifdef KOKKOS_ENABLE_SYCL
+  const EmulateCUDADim3<index_type> gridDim;
+  const EmulateCUDADim3<index_type> blockIdx;
+  const EmulateCUDADim3<index_type> threadIdx;
+#endif
 };
 
 // Rank 3
-// Specializations for void tag type
-template <typename PolicyType, typename Functor>
-struct DeviceIterateTile<3, PolicyType, Functor, void> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_)
-      : m_policy(policy_), m_func(f_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    // LL
-    if (PolicyType::inner_direction == PolicyType::Left) {
-      for (index_type tile_id2 = static_cast<index_type>(blockIdx.z);
-           tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.z) {
-        const index_type offset_2 =
-            tile_id2 * m_policy.m_tile[2] +
-            static_cast<index_type>(threadIdx.z) +
-            static_cast<index_type>(m_policy.m_lower[2]);
-        if (offset_2 < m_policy.m_upper[2] &&
-            static_cast<index_type>(threadIdx.z) < m_policy.m_tile[2]) {
-          for (index_type tile_id1 = static_cast<index_type>(blockIdx.y);
-               tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) {
-            const index_type offset_1 =
-                tile_id1 * m_policy.m_tile[1] +
-                static_cast<index_type>(threadIdx.y) +
-                static_cast<index_type>(m_policy.m_lower[1]);
-            if (offset_1 < m_policy.m_upper[1] &&
-                static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) {
-              for (index_type tile_id0 = static_cast<index_type>(blockIdx.x);
-                   tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) {
-                const index_type offset_0 =
-                    tile_id0 * m_policy.m_tile[0] +
-                    static_cast<index_type>(threadIdx.x) +
-                    static_cast<index_type>(m_policy.m_lower[0]);
-                if (offset_0 < m_policy.m_upper[0] &&
-                    static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) {
-                  m_func(offset_0, offset_1, offset_2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-    // LR
-    else {
-      for (index_type tile_id0 = static_cast<index_type>(blockIdx.x);
-           tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) {
-        const index_type offset_0 =
-            tile_id0 * m_policy.m_tile[0] +
-            static_cast<index_type>(threadIdx.x) +
-            static_cast<index_type>(m_policy.m_lower[0]);
-        if (offset_0 < m_policy.m_upper[0] &&
-            static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) {
-          for (index_type tile_id1 = static_cast<index_type>(blockIdx.y);
-               tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) {
-            const index_type offset_1 =
-                tile_id1 * m_policy.m_tile[1] +
-                static_cast<index_type>(threadIdx.y) +
-                static_cast<index_type>(m_policy.m_lower[1]);
-            if (offset_1 < m_policy.m_upper[1] &&
-                static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) {
-              for (index_type tile_id2 = static_cast<index_type>(blockIdx.z);
-                   tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.z) {
-                const index_type offset_2 =
-                    tile_id2 * m_policy.m_tile[2] +
-                    static_cast<index_type>(threadIdx.z) +
-                    static_cast<index_type>(m_policy.m_lower[2]);
-                if (offset_2 < m_policy.m_upper[2] &&
-                    static_cast<index_type>(threadIdx.z) < m_policy.m_tile[2]) {
-                  m_func(offset_0, offset_1, offset_2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-};
-
-// Specializations for void tag type
 template <typename PolicyType, typename Functor, typename Tag>
 struct DeviceIterateTile<3, PolicyType, Functor, Tag> {
   using index_type = typename PolicyType::index_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(
+      const PolicyType& policy_, const Functor& f_,
+      const EmulateCUDADim3<index_type> gridDim_,
+      const EmulateCUDADim3<index_type> blockIdx_,
+      const EmulateCUDADim3<index_type> threadIdx_)
+      : m_policy(policy_),
+        m_func(f_),
+        gridDim(gridDim_),
+        blockIdx(blockIdx_),
+        threadIdx(threadIdx_) {}
+#else
   KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
                                                 const Functor& f_)
       : m_policy(policy_), m_func(f_) {}
+#endif
 
   KOKKOS_IMPL_DEVICE_FUNCTION
   void exec_range() const {
-    if (PolicyType::inner_direction == PolicyType::Left) {
+    if (PolicyType::inner_direction == Iterate::Left) {
       for (index_type tile_id2 = static_cast<index_type>(blockIdx.z);
            tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.z) {
         const index_type offset_2 =
@@ -316,7 +230,7 @@ struct DeviceIterateTile<3, PolicyType, Functor, Tag> {
                     static_cast<index_type>(m_policy.m_lower[0]);
                 if (offset_0 < m_policy.m_upper[0] &&
                     static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) {
-                  m_func(Tag(), offset_0, offset_1, offset_2);
+                  Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, offset_2);
                 }
               }
             }
@@ -348,7 +262,7 @@ struct DeviceIterateTile<3, PolicyType, Functor, Tag> {
                     static_cast<index_type>(m_policy.m_lower[2]);
                 if (offset_2 < m_policy.m_upper[2] &&
                     static_cast<index_type>(threadIdx.z) < m_policy.m_tile[2]) {
-                  m_func(Tag(), offset_0, offset_1, offset_2);
+                  Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, offset_2);
                 }
               }
             }
@@ -361,30 +275,46 @@ struct DeviceIterateTile<3, PolicyType, Functor, Tag> {
  private:
   const PolicyType& m_policy;
   const Functor& m_func;
+#ifdef KOKKOS_ENABLE_SYCL
+  const EmulateCUDADim3<index_type> gridDim;
+  const EmulateCUDADim3<index_type> blockIdx;
+  const EmulateCUDADim3<index_type> threadIdx;
+#endif
 };
 
 // Rank 4
-// Specializations for void tag type
-template <typename PolicyType, typename Functor>
-struct DeviceIterateTile<4, PolicyType, Functor, void> {
+template <typename PolicyType, typename Functor, typename Tag>
+struct DeviceIterateTile<4, PolicyType, Functor, Tag> {
   using index_type = typename PolicyType::index_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(
+      const PolicyType& policy_, const Functor& f_,
+      const EmulateCUDADim3<index_type> gridDim_,
+      const EmulateCUDADim3<index_type> blockIdx_,
+      const EmulateCUDADim3<index_type> threadIdx_)
+      : m_policy(policy_),
+        m_func(f_),
+        gridDim(gridDim_),
+        blockIdx(blockIdx_),
+        threadIdx(threadIdx_) {}
+#else
   KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
                                                 const Functor& f_)
       : m_policy(policy_), m_func(f_) {}
+#endif
 
   static constexpr index_type max_blocks = 65535;
 
   KOKKOS_IMPL_DEVICE_FUNCTION
   void exec_range() const {
-    // LL
-    if (PolicyType::inner_direction == PolicyType::Left) {
+    if (PolicyType::inner_direction == Iterate::Left) {
       const index_type temp0  = m_policy.m_tile_end[0];
       const index_type temp1  = m_policy.m_tile_end[1];
       const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
       const index_type numbl1 =
           (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl0)
+               ? static_cast<index_type>(max_blocks / numbl0)
                : (temp1 <= max_blocks ? temp1 : max_blocks));
 
       const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0;
@@ -424,7 +354,8 @@ struct DeviceIterateTile<4, PolicyType, Functor, void> {
                         static_cast<index_type>(m_policy.m_lower[0]);
                     if (offset_0 < m_policy.m_upper[0] &&
                         thr_id0 < m_policy.m_tile[0]) {
-                      m_func(offset_0, offset_1, offset_2, offset_3);
+                      Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1,
+                                             offset_2, offset_3);
                     }
                   }
                 }
@@ -433,9 +364,7 @@ struct DeviceIterateTile<4, PolicyType, Functor, void> {
           }
         }
       }
-    }
-    // LR
-    else {
+    } else {
       const index_type temp0  = m_policy.m_tile_end[0];
       const index_type temp1  = m_policy.m_tile_end[1];
       const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
@@ -482,7 +411,8 @@ struct DeviceIterateTile<4, PolicyType, Functor, void> {
                     if (offset_3 < m_policy.m_upper[3] &&
                         static_cast<index_type>(threadIdx.z) <
                             m_policy.m_tile[3]) {
-                      m_func(offset_0, offset_1, offset_2, offset_3);
+                      Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1,
+                                             offset_2, offset_3);
                     }
                   }
                 }
@@ -497,28 +427,47 @@ struct DeviceIterateTile<4, PolicyType, Functor, void> {
  private:
   const PolicyType& m_policy;
   const Functor& m_func;
+#ifdef KOKKOS_ENABLE_SYCL
+  const EmulateCUDADim3<index_type> gridDim;
+  const EmulateCUDADim3<index_type> blockIdx;
+  const EmulateCUDADim3<index_type> threadIdx;
+#endif
 };
 
-// Specializations for void tag type
+// Rank 5
 template <typename PolicyType, typename Functor, typename Tag>
-struct DeviceIterateTile<4, PolicyType, Functor, Tag> {
+struct DeviceIterateTile<5, PolicyType, Functor, Tag> {
   using index_type = typename PolicyType::index_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(
+      const PolicyType& policy_, const Functor& f_,
+      const EmulateCUDADim3<index_type> gridDim_,
+      const EmulateCUDADim3<index_type> blockIdx_,
+      const EmulateCUDADim3<index_type> threadIdx_)
+      : m_policy(policy_),
+        m_func(f_),
+        gridDim(gridDim_),
+        blockIdx(blockIdx_),
+        threadIdx(threadIdx_) {}
+#else
   KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
                                                 const Functor& f_)
       : m_policy(policy_), m_func(f_) {}
+#endif
 
   static constexpr index_type max_blocks = 65535;
 
   KOKKOS_IMPL_DEVICE_FUNCTION
   void exec_range() const {
-    if (PolicyType::inner_direction == PolicyType::Left) {
-      const index_type temp0  = m_policy.m_tile_end[0];
-      const index_type temp1  = m_policy.m_tile_end[1];
+    // LL
+    if (PolicyType::inner_direction == Iterate::Left) {
+      index_type temp0        = m_policy.m_tile_end[0];
+      index_type temp1        = m_policy.m_tile_end[1];
       const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
       const index_type numbl1 =
           (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl0)
+               ? index_type(max_blocks / numbl0)
                : (temp1 <= max_blocks ? temp1 : max_blocks));
 
       const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0;
@@ -528,37 +477,61 @@ struct DeviceIterateTile<4, PolicyType, Functor, Tag> {
       const index_type thr_id1 =
           static_cast<index_type>(threadIdx.x) / m_policy.m_tile[0];
 
-      for (index_type tile_id3 = static_cast<index_type>(blockIdx.z);
-           tile_id3 < m_policy.m_tile_end[3]; tile_id3 += gridDim.z) {
-        const index_type offset_3 =
-            tile_id3 * m_policy.m_tile[3] +
+      temp0                   = m_policy.m_tile_end[2];
+      temp1                   = m_policy.m_tile_end[3];
+      const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks);
+      const index_type numbl3 =
+          (temp0 * temp1 > max_blocks
+               ? index_type(max_blocks / numbl2)
+               : (temp1 <= max_blocks ? temp1 : max_blocks));
+
+      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) % numbl2;
+      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) / numbl2;
+      const index_type thr_id2 =
+          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[2];
+      const index_type thr_id3 =
+          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[2];
+
+      for (index_type tile_id4 = static_cast<index_type>(blockIdx.z);
+           tile_id4 < m_policy.m_tile_end[4]; tile_id4 += gridDim.z) {
+        const index_type offset_4 =
+            tile_id4 * m_policy.m_tile[4] +
             static_cast<index_type>(threadIdx.z) +
-            static_cast<index_type>(m_policy.m_lower[3]);
-        if (offset_3 < m_policy.m_upper[3] &&
-            static_cast<index_type>(threadIdx.z) < m_policy.m_tile[3]) {
-          for (index_type tile_id2 = static_cast<index_type>(blockIdx.y);
-               tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.y) {
-            const index_type offset_2 =
-                tile_id2 * m_policy.m_tile[2] +
-                static_cast<index_type>(threadIdx.y) +
-                static_cast<index_type>(m_policy.m_lower[2]);
-            if (offset_2 < m_policy.m_upper[2] &&
-                static_cast<index_type>(threadIdx.y) < m_policy.m_tile[2]) {
-              for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-                   j += numbl1) {
-                const index_type offset_1 =
-                    j * m_policy.m_tile[1] + thr_id1 +
-                    static_cast<index_type>(m_policy.m_lower[1]);
-                if (offset_1 < m_policy.m_upper[1] &&
-                    thr_id1 < m_policy.m_tile[1]) {
-                  for (index_type i = tile_id0; i < m_policy.m_tile_end[0];
-                       i += numbl0) {
-                    const index_type offset_0 =
-                        i * m_policy.m_tile[0] + thr_id0 +
-                        static_cast<index_type>(m_policy.m_lower[0]);
-                    if (offset_0 < m_policy.m_upper[0] &&
-                        thr_id0 < m_policy.m_tile[0]) {
-                      m_func(Tag(), offset_0, offset_1, offset_2, offset_3);
+            static_cast<index_type>(m_policy.m_lower[4]);
+        if (offset_4 < m_policy.m_upper[4] &&
+            static_cast<index_type>(threadIdx.z) < m_policy.m_tile[4]) {
+          for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
+               l += numbl3) {
+            const index_type offset_3 =
+                l * m_policy.m_tile[3] + thr_id3 +
+                static_cast<index_type>(m_policy.m_lower[3]);
+            if (offset_3 < m_policy.m_upper[3] &&
+                thr_id3 < m_policy.m_tile[3]) {
+              for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
+                   k += numbl2) {
+                const index_type offset_2 =
+                    k * m_policy.m_tile[2] + thr_id2 +
+                    static_cast<index_type>(m_policy.m_lower[2]);
+                if (offset_2 < m_policy.m_upper[2] &&
+                    thr_id2 < m_policy.m_tile[2]) {
+                  for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
+                       j += numbl1) {
+                    const index_type offset_1 =
+                        j * m_policy.m_tile[1] + thr_id1 +
+                        static_cast<index_type>(m_policy.m_lower[1]);
+                    if (offset_1 < m_policy.m_upper[1] &&
+                        thr_id1 < m_policy.m_tile[1]) {
+                      for (index_type i = tile_id0; i < m_policy.m_tile_end[0];
+                           i += numbl0) {
+                        const index_type offset_0 =
+                            i * m_policy.m_tile[0] + thr_id0 +
+                            static_cast<index_type>(m_policy.m_lower[0]);
+                        if (offset_0 < m_policy.m_upper[0] &&
+                            thr_id0 < m_policy.m_tile[0]) {
+                          Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1,
+                                                 offset_2, offset_3, offset_4);
+                        }
+                      }
                     }
                   }
                 }
@@ -567,13 +540,15 @@ struct DeviceIterateTile<4, PolicyType, Functor, Tag> {
           }
         }
       }
-    } else {
-      const index_type temp0  = m_policy.m_tile_end[0];
-      const index_type temp1  = m_policy.m_tile_end[1];
-      const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl0 =
+    }
+    // LR
+    else {
+      index_type temp0        = m_policy.m_tile_end[0];
+      index_type temp1        = m_policy.m_tile_end[1];
+      const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
+      const index_type numbl0 =
           (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl1)
+               ? static_cast<index_type>(max_blocks / numbl1)
                : (temp0 <= max_blocks ? temp0 : max_blocks));
 
       const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1;
@@ -583,6 +558,21 @@ struct DeviceIterateTile<4, PolicyType, Functor, Tag> {
       const index_type thr_id1 =
           static_cast<index_type>(threadIdx.x) % m_policy.m_tile[1];
 
+      temp0                   = m_policy.m_tile_end[2];
+      temp1                   = m_policy.m_tile_end[3];
+      const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks);
+      const index_type numbl2 =
+          (temp0 * temp1 > max_blocks
+               ? index_type(max_blocks / numbl3)
+               : (temp0 <= max_blocks ? temp0 : max_blocks));
+
+      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) / numbl3;
+      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) % numbl3;
+      const index_type thr_id2 =
+          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[3];
+      const index_type thr_id3 =
+          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[3];
+
       for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) {
         const index_type offset_0 =
             i * m_policy.m_tile[0] + thr_id0 +
@@ -591,30 +581,39 @@ struct DeviceIterateTile<4, PolicyType, Functor, Tag> {
           for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
                j += numbl1) {
             const index_type offset_1 =
-                tile_id1 * m_policy.m_tile[1] + thr_id1 +
+                j * m_policy.m_tile[1] + thr_id1 +
                 static_cast<index_type>(m_policy.m_lower[1]);
             if (offset_1 < m_policy.m_upper[1] &&
                 thr_id1 < m_policy.m_tile[1]) {
-              for (index_type tile_id2 = static_cast<index_type>(blockIdx.y);
-                   tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.y) {
+              for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
+                   k += numbl2) {
                 const index_type offset_2 =
-                    tile_id2 * m_policy.m_tile[2] +
-                    static_cast<index_type>(threadIdx.y) +
+                    k * m_policy.m_tile[2] + thr_id2 +
                     static_cast<index_type>(m_policy.m_lower[2]);
                 if (offset_2 < m_policy.m_upper[2] &&
-                    static_cast<index_type>(threadIdx.y) < m_policy.m_tile[2]) {
-                  for (index_type tile_id3 =
-                           static_cast<index_type>(blockIdx.z);
-                       tile_id3 < m_policy.m_tile_end[3];
-                       tile_id3 += gridDim.z) {
+                    thr_id2 < m_policy.m_tile[2]) {
+                  for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
+                       l += numbl3) {
                     const index_type offset_3 =
-                        tile_id3 * m_policy.m_tile[3] +
-                        static_cast<index_type>(threadIdx.z) +
+                        l * m_policy.m_tile[3] + thr_id3 +
                         static_cast<index_type>(m_policy.m_lower[3]);
                     if (offset_3 < m_policy.m_upper[3] &&
-                        static_cast<index_type>(threadIdx.z) <
-                            m_policy.m_tile[3]) {
-                      m_func(Tag(), offset_0, offset_1, offset_2, offset_3);
+                        thr_id3 < m_policy.m_tile[3]) {
+                      for (index_type tile_id4 =
+                               static_cast<index_type>(blockIdx.z);
+                           tile_id4 < m_policy.m_tile_end[4];
+                           tile_id4 += gridDim.z) {
+                        const index_type offset_4 =
+                            tile_id4 * m_policy.m_tile[4] +
+                            static_cast<index_type>(threadIdx.z) +
+                            static_cast<index_type>(m_policy.m_lower[4]);
+                        if (offset_4 < m_policy.m_upper[4] &&
+                            static_cast<index_type>(threadIdx.z) <
+                                m_policy.m_tile[4]) {
+                          Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1,
+                                                 offset_2, offset_3, offset_4);
+                        }
+                      }
                     }
                   }
                 }
@@ -629,30 +628,47 @@ struct DeviceIterateTile<4, PolicyType, Functor, Tag> {
  private:
   const PolicyType& m_policy;
   const Functor& m_func;
+#ifdef KOKKOS_ENABLE_SYCL
+  const EmulateCUDADim3<index_type> gridDim;
+  const EmulateCUDADim3<index_type> blockIdx;
+  const EmulateCUDADim3<index_type> threadIdx;
+#endif
 };
 
-// Rank 5
-// Specializations for void tag type
-template <typename PolicyType, typename Functor>
-struct DeviceIterateTile<5, PolicyType, Functor, void> {
+// Rank 6
+template <typename PolicyType, typename Functor, typename Tag>
+struct DeviceIterateTile<6, PolicyType, Functor, Tag> {
   using index_type = typename PolicyType::index_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(
+      const PolicyType& policy_, const Functor& f_,
+      const EmulateCUDADim3<index_type> gridDim_,
+      const EmulateCUDADim3<index_type> blockIdx_,
+      const EmulateCUDADim3<index_type> threadIdx_)
+      : m_policy(policy_),
+        m_func(f_),
+        gridDim(gridDim_),
+        blockIdx(blockIdx_),
+        threadIdx(threadIdx_) {}
+#else
   KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
                                                 const Functor& f_)
       : m_policy(policy_), m_func(f_) {}
+#endif
 
   static constexpr index_type max_blocks = 65535;
 
   KOKKOS_IMPL_DEVICE_FUNCTION
   void exec_range() const {
     // LL
-    if (PolicyType::inner_direction == PolicyType::Left) {
+    if (PolicyType::inner_direction == Iterate::Left) {
       index_type temp0        = m_policy.m_tile_end[0];
       index_type temp1        = m_policy.m_tile_end[1];
       const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
       const index_type numbl1 =
           (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl0)
+               ? static_cast<index_type>(max_blocks / numbl0)
                : (temp1 <= max_blocks ? temp1 : max_blocks));
 
       const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0;
@@ -667,7 +683,7 @@ struct DeviceIterateTile<5, PolicyType, Functor, void> {
       const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks);
       const index_type numbl3 =
           (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl2)
+               ? static_cast<index_type>(max_blocks / numbl2)
                : (temp1 <= max_blocks ? temp1 : max_blocks));
 
       const index_type tile_id2 = static_cast<index_type>(blockIdx.y) % numbl2;
@@ -677,44 +693,66 @@ struct DeviceIterateTile<5, PolicyType, Functor, void> {
       const index_type thr_id3 =
           static_cast<index_type>(threadIdx.y) / m_policy.m_tile[2];
 
-      for (index_type tile_id4 = static_cast<index_type>(blockIdx.z);
-           tile_id4 < m_policy.m_tile_end[4]; tile_id4 += gridDim.z) {
-        const index_type offset_4 =
-            tile_id4 * m_policy.m_tile[4] +
-            static_cast<index_type>(threadIdx.z) +
-            static_cast<index_type>(m_policy.m_lower[4]);
-        if (offset_4 < m_policy.m_upper[4] &&
-            static_cast<index_type>(threadIdx.z) < m_policy.m_tile[4]) {
-          for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
-               l += numbl3) {
-            const index_type offset_3 =
-                l * m_policy.m_tile[3] + thr_id3 +
-                static_cast<index_type>(m_policy.m_lower[3]);
-            if (offset_3 < m_policy.m_upper[3] &&
-                thr_id3 < m_policy.m_tile[3]) {
-              for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
-                   k += numbl2) {
-                const index_type offset_2 =
-                    k * m_policy.m_tile[2] + thr_id2 +
-                    static_cast<index_type>(m_policy.m_lower[2]);
-                if (offset_2 < m_policy.m_upper[2] &&
-                    thr_id2 < m_policy.m_tile[2]) {
-                  for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-                       j += numbl1) {
-                    const index_type offset_1 =
-                        j * m_policy.m_tile[1] + thr_id1 +
-                        static_cast<index_type>(m_policy.m_lower[1]);
-                    if (offset_1 < m_policy.m_upper[1] &&
-                        thr_id1 < m_policy.m_tile[1]) {
-                      for (index_type i = tile_id0; i < m_policy.m_tile_end[0];
-                           i += numbl0) {
-                        const index_type offset_0 =
-                            i * m_policy.m_tile[0] + thr_id0 +
-                            static_cast<index_type>(m_policy.m_lower[0]);
-                        if (offset_0 < m_policy.m_upper[0] &&
-                            thr_id0 < m_policy.m_tile[0]) {
-                          m_func(offset_0, offset_1, offset_2, offset_3,
-                                 offset_4);
+      temp0                   = m_policy.m_tile_end[4];
+      temp1                   = m_policy.m_tile_end[5];
+      const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks);
+      const index_type numbl5 =
+          (temp0 * temp1 > max_blocks
+               ? static_cast<index_type>(max_blocks / numbl4)
+               : (temp1 <= max_blocks ? temp1 : max_blocks));
+
+      const index_type tile_id4 = static_cast<index_type>(blockIdx.z) % numbl4;
+      const index_type tile_id5 = static_cast<index_type>(blockIdx.z) / numbl4;
+      const index_type thr_id4 =
+          static_cast<index_type>(threadIdx.z) % m_policy.m_tile[4];
+      const index_type thr_id5 =
+          static_cast<index_type>(threadIdx.z) / m_policy.m_tile[4];
+
+      for (index_type n = tile_id5; n < m_policy.m_tile_end[5]; n += numbl5) {
+        const index_type offset_5 =
+            n * m_policy.m_tile[5] + thr_id5 +
+            static_cast<index_type>(m_policy.m_lower[5]);
+        if (offset_5 < m_policy.m_upper[5] && thr_id5 < m_policy.m_tile[5]) {
+          for (index_type m = tile_id4; m < m_policy.m_tile_end[4];
+               m += numbl4) {
+            const index_type offset_4 =
+                m * m_policy.m_tile[4] + thr_id4 +
+                static_cast<index_type>(m_policy.m_lower[4]);
+            if (offset_4 < m_policy.m_upper[4] &&
+                thr_id4 < m_policy.m_tile[4]) {
+              for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
+                   l += numbl3) {
+                const index_type offset_3 =
+                    l * m_policy.m_tile[3] + thr_id3 +
+                    static_cast<index_type>(m_policy.m_lower[3]);
+                if (offset_3 < m_policy.m_upper[3] &&
+                    thr_id3 < m_policy.m_tile[3]) {
+                  for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
+                       k += numbl2) {
+                    const index_type offset_2 =
+                        k * m_policy.m_tile[2] + thr_id2 +
+                        static_cast<index_type>(m_policy.m_lower[2]);
+                    if (offset_2 < m_policy.m_upper[2] &&
+                        thr_id2 < m_policy.m_tile[2]) {
+                      for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
+                           j += numbl1) {
+                        const index_type offset_1 =
+                            j * m_policy.m_tile[1] + thr_id1 +
+                            static_cast<index_type>(m_policy.m_lower[1]);
+                        if (offset_1 < m_policy.m_upper[1] &&
+                            thr_id1 < m_policy.m_tile[1]) {
+                          for (index_type i = tile_id0;
+                               i < m_policy.m_tile_end[0]; i += numbl0) {
+                            const index_type offset_0 =
+                                i * m_policy.m_tile[0] + thr_id0 +
+                                static_cast<index_type>(m_policy.m_lower[0]);
+                            if (offset_0 < m_policy.m_upper[0] &&
+                                thr_id0 < m_policy.m_tile[0]) {
+                              Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1,
+                                                     offset_2, offset_3,
+                                                     offset_4, offset_5);
+                            }
+                          }
                         }
                       }
                     }
@@ -733,7 +771,7 @@ struct DeviceIterateTile<5, PolicyType, Functor, void> {
       const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
       const index_type numbl0 =
           (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl1)
+               ? static_cast<index_type>(max_blocks / numbl1)
                : (temp0 <= max_blocks ? temp0 : max_blocks));
 
       const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1;
@@ -748,7 +786,7 @@ struct DeviceIterateTile<5, PolicyType, Functor, void> {
       const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks);
       const index_type numbl2 =
           (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl3)
+               ? static_cast<index_type>(max_blocks / numbl3)
                : (temp0 <= max_blocks ? temp0 : max_blocks));
 
       const index_type tile_id2 = static_cast<index_type>(blockIdx.y) / numbl3;
@@ -758,6 +796,21 @@ struct DeviceIterateTile<5, PolicyType, Functor, void> {
       const index_type thr_id3 =
           static_cast<index_type>(threadIdx.y) % m_policy.m_tile[3];
 
+      temp0                   = m_policy.m_tile_end[4];
+      temp1                   = m_policy.m_tile_end[5];
+      const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks);
+      const index_type numbl4 =
+          (temp0 * temp1 > max_blocks
+               ? static_cast<index_type>(max_blocks / numbl5)
+               : (temp0 <= max_blocks ? temp0 : max_blocks));
+
+      const index_type tile_id4 = static_cast<index_type>(blockIdx.z) / numbl5;
+      const index_type tile_id5 = static_cast<index_type>(blockIdx.z) % numbl5;
+      const index_type thr_id4 =
+          static_cast<index_type>(threadIdx.z) / m_policy.m_tile[5];
+      const index_type thr_id5 =
+          static_cast<index_type>(threadIdx.z) % m_policy.m_tile[5];
+
       for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) {
         const index_type offset_0 =
             i * m_policy.m_tile[0] + thr_id0 +
@@ -784,19 +837,25 @@ struct DeviceIterateTile<5, PolicyType, Functor, void> {
                         static_cast<index_type>(m_policy.m_lower[3]);
                     if (offset_3 < m_policy.m_upper[3] &&
                         thr_id3 < m_policy.m_tile[3]) {
-                      for (index_type tile_id4 =
-                               static_cast<index_type>(blockIdx.z);
-                           tile_id4 < m_policy.m_tile_end[4];
-                           tile_id4 += gridDim.z) {
+                      for (index_type m = tile_id4; m < m_policy.m_tile_end[4];
+                           m += numbl4) {
                         const index_type offset_4 =
-                            tile_id4 * m_policy.m_tile[4] +
-                            static_cast<index_type>(threadIdx.z) +
+                            m * m_policy.m_tile[4] + thr_id4 +
                             static_cast<index_type>(m_policy.m_lower[4]);
                         if (offset_4 < m_policy.m_upper[4] &&
-                            static_cast<index_type>(threadIdx.z) <
-                                m_policy.m_tile[4]) {
-                          m_func(offset_0, offset_1, offset_2, offset_3,
-                                 offset_4);
+                            thr_id4 < m_policy.m_tile[4]) {
+                          for (index_type n = tile_id5;
+                               n < m_policy.m_tile_end[5]; n += numbl5) {
+                            const index_type offset_5 =
+                                n * m_policy.m_tile[5] + thr_id5 +
+                                static_cast<index_type>(m_policy.m_lower[5]);
+                            if (offset_5 < m_policy.m_upper[5] &&
+                                thr_id5 < m_policy.m_tile[5]) {
+                              Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1,
+                                                     offset_2, offset_3,
+                                                     offset_4, offset_5);
+                            }
+                          }
                         }
                       }
                     }
@@ -813,2361 +872,90 @@ struct DeviceIterateTile<5, PolicyType, Functor, void> {
  private:
   const PolicyType& m_policy;
   const Functor& m_func;
+#ifdef KOKKOS_ENABLE_SYCL
+  const EmulateCUDADim3<index_type> gridDim;
+  const EmulateCUDADim3<index_type> blockIdx;
+  const EmulateCUDADim3<index_type> threadIdx;
+#endif
 };
 
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag>
-struct DeviceIterateTile<5, PolicyType, Functor, Tag> {
-  using index_type = typename PolicyType::index_type;
+// ----------------------------------------------------------------------------------
 
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_)
-      : m_policy(policy_), m_func(f_) {}
+namespace Reduce {
 
-  static constexpr index_type max_blocks = 65535;
+template <typename T>
+using is_void = std::is_same<T, void>;
 
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    // LL
-    if (PolicyType::inner_direction == PolicyType::Left) {
-      index_type temp0        = m_policy.m_tile_end[0];
-      index_type temp1        = m_policy.m_tile_end[1];
-      const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl1 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl0)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
+template <typename T>
+struct is_array_type : std::false_type {
+  using value_type = T;
+};
 
-      const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0;
-      const index_type tile_id1 = static_cast<index_type>(blockIdx.x) / numbl0;
-      const index_type thr_id0 =
-          static_cast<index_type>(threadIdx.x) % m_policy.m_tile[0];
-      const index_type thr_id1 =
-          static_cast<index_type>(threadIdx.x) / m_policy.m_tile[0];
+template <typename T>
+struct is_array_type<T*> : std::true_type {
+  using value_type = T;
+};
 
-      temp0                   = m_policy.m_tile_end[2];
-      temp1                   = m_policy.m_tile_end[3];
-      const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl3 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl2)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
+template <typename T>
+struct is_array_type<T[]> : std::true_type {
+  using value_type = T;
+};
 
-      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) % numbl2;
-      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) / numbl2;
-      const index_type thr_id2 =
-          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[2];
-      const index_type thr_id3 =
-          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[2];
+// ------------------------------------------------------------------ //
 
-      for (index_type tile_id4 = static_cast<index_type>(blockIdx.z);
-           tile_id4 < m_policy.m_tile_end[4]; tile_id4 += gridDim.z) {
-        const index_type offset_4 =
-            tile_id4 * m_policy.m_tile[4] +
-            static_cast<index_type>(threadIdx.z) +
-            static_cast<index_type>(m_policy.m_lower[4]);
-        if (offset_4 < m_policy.m_upper[4] &&
-            static_cast<index_type>(threadIdx.z) < m_policy.m_tile[4]) {
-          for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
-               l += numbl3) {
-            const index_type offset_3 =
-                l * m_policy.m_tile[3] + thr_id3 +
-                static_cast<index_type>(m_policy.m_lower[3]);
-            if (offset_3 < m_policy.m_upper[3] &&
-                thr_id3 < m_policy.m_tile[3]) {
-              for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
-                   k += numbl2) {
-                const index_type offset_2 =
-                    k * m_policy.m_tile[2] + thr_id2 +
-                    static_cast<index_type>(m_policy.m_lower[2]);
-                if (offset_2 < m_policy.m_upper[2] &&
-                    thr_id2 < m_policy.m_tile[2]) {
-                  for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-                       j += numbl1) {
-                    const index_type offset_1 =
-                        j * m_policy.m_tile[1] + thr_id1 +
-                        static_cast<index_type>(m_policy.m_lower[1]);
-                    if (offset_1 < m_policy.m_upper[1] &&
-                        thr_id1 < m_policy.m_tile[1]) {
-                      for (index_type i = tile_id0; i < m_policy.m_tile_end[0];
-                           i += numbl0) {
-                        const index_type offset_0 =
-                            i * m_policy.m_tile[0] + thr_id0 +
-                            static_cast<index_type>(m_policy.m_lower[0]);
-                        if (offset_0 < m_policy.m_upper[0] &&
-                            thr_id0 < m_policy.m_tile[0]) {
-                          m_func(Tag(), offset_0, offset_1, offset_2, offset_3,
-                                 offset_4);
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-    // LR
-    else {
-      index_type temp0        = m_policy.m_tile_end[0];
-      index_type temp1        = m_policy.m_tile_end[1];
-      const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl0 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl1)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1;
-      const index_type tile_id1 = static_cast<index_type>(blockIdx.x) % numbl1;
-      const index_type thr_id0 =
-          static_cast<index_type>(threadIdx.x) / m_policy.m_tile[1];
-      const index_type thr_id1 =
-          static_cast<index_type>(threadIdx.x) % m_policy.m_tile[1];
-
-      temp0                   = m_policy.m_tile_end[2];
-      temp1                   = m_policy.m_tile_end[3];
-      const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl2 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl3)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) / numbl3;
-      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) % numbl3;
-      const index_type thr_id2 =
-          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[3];
-      const index_type thr_id3 =
-          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[3];
-
-      for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) {
-        const index_type offset_0 =
-            i * m_policy.m_tile[0] + thr_id0 +
-            static_cast<index_type>(m_policy.m_lower[0]);
-        if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) {
-          for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-               j += numbl1) {
-            const index_type offset_1 =
-                j * m_policy.m_tile[1] + thr_id1 +
-                static_cast<index_type>(m_policy.m_lower[1]);
-            if (offset_1 < m_policy.m_upper[1] &&
-                thr_id1 < m_policy.m_tile[1]) {
-              for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
-                   k += numbl2) {
-                const index_type offset_2 =
-                    k * m_policy.m_tile[2] + thr_id2 +
-                    static_cast<index_type>(m_policy.m_lower[2]);
-                if (offset_2 < m_policy.m_upper[2] &&
-                    thr_id2 < m_policy.m_tile[2]) {
-                  for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
-                       l += numbl3) {
-                    const index_type offset_3 =
-                        l * m_policy.m_tile[3] + thr_id3 +
-                        static_cast<index_type>(m_policy.m_lower[3]);
-                    if (offset_3 < m_policy.m_upper[3] &&
-                        thr_id3 < m_policy.m_tile[3]) {
-                      for (index_type tile_id4 =
-                               static_cast<index_type>(blockIdx.z);
-                           tile_id4 < m_policy.m_tile_end[4];
-                           tile_id4 += gridDim.z) {
-                        const index_type offset_4 =
-                            tile_id4 * m_policy.m_tile[4] +
-                            static_cast<index_type>(threadIdx.z) +
-                            static_cast<index_type>(m_policy.m_lower[4]);
-                        if (offset_4 < m_policy.m_upper[4] &&
-                            static_cast<index_type>(threadIdx.z) <
-                                m_policy.m_tile[4]) {
-                          m_func(Tag(), offset_0, offset_1, offset_2, offset_3,
-                                 offset_4);
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-};
-
-// Rank 6
-// Specializations for void tag type
-template <typename PolicyType, typename Functor>
-struct DeviceIterateTile<6, PolicyType, Functor, void> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_)
-      : m_policy(policy_), m_func(f_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    // LL
-    if (PolicyType::inner_direction == PolicyType::Left) {
-      index_type temp0        = m_policy.m_tile_end[0];
-      index_type temp1        = m_policy.m_tile_end[1];
-      const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl1 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl0)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0;
-      const index_type tile_id1 = static_cast<index_type>(blockIdx.x) / numbl0;
-      const index_type thr_id0 =
-          static_cast<index_type>(threadIdx.x) % m_policy.m_tile[0];
-      const index_type thr_id1 =
-          static_cast<index_type>(threadIdx.x) / m_policy.m_tile[0];
-
-      temp0                   = m_policy.m_tile_end[2];
-      temp1                   = m_policy.m_tile_end[3];
-      const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl3 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl2)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) % numbl2;
-      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) / numbl2;
-      const index_type thr_id2 =
-          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[2];
-      const index_type thr_id3 =
-          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[2];
-
-      temp0                   = m_policy.m_tile_end[4];
-      temp1                   = m_policy.m_tile_end[5];
-      const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl5 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl4)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id4 = static_cast<index_type>(blockIdx.z) % numbl4;
-      const index_type tile_id5 = static_cast<index_type>(blockIdx.z) / numbl4;
-      const index_type thr_id4 =
-          static_cast<index_type>(threadIdx.z) % m_policy.m_tile[4];
-      const index_type thr_id5 =
-          static_cast<index_type>(threadIdx.z) / m_policy.m_tile[4];
-
-      for (index_type n = tile_id5; n < m_policy.m_tile_end[5]; n += numbl5) {
-        const index_type offset_5 =
-            n * m_policy.m_tile[5] + thr_id5 +
-            static_cast<index_type>(m_policy.m_lower[5]);
-        if (offset_5 < m_policy.m_upper[5] && thr_id5 < m_policy.m_tile[5]) {
-          for (index_type m = tile_id4; m < m_policy.m_tile_end[4];
-               m += numbl4) {
-            const index_type offset_4 =
-                m * m_policy.m_tile[4] + thr_id4 +
-                static_cast<index_type>(m_policy.m_lower[4]);
-            if (offset_4 < m_policy.m_upper[4] &&
-                thr_id4 < m_policy.m_tile[4]) {
-              for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
-                   l += numbl3) {
-                const index_type offset_3 =
-                    l * m_policy.m_tile[3] + thr_id3 +
-                    static_cast<index_type>(m_policy.m_lower[3]);
-                if (offset_3 < m_policy.m_upper[3] &&
-                    thr_id3 < m_policy.m_tile[3]) {
-                  for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
-                       k += numbl2) {
-                    const index_type offset_2 =
-                        k * m_policy.m_tile[2] + thr_id2 +
-                        static_cast<index_type>(m_policy.m_lower[2]);
-                    if (offset_2 < m_policy.m_upper[2] &&
-                        thr_id2 < m_policy.m_tile[2]) {
-                      for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-                           j += numbl1) {
-                        const index_type offset_1 =
-                            j * m_policy.m_tile[1] + thr_id1 +
-                            static_cast<index_type>(m_policy.m_lower[1]);
-                        if (offset_1 < m_policy.m_upper[1] &&
-                            thr_id1 < m_policy.m_tile[1]) {
-                          for (index_type i = tile_id0;
-                               i < m_policy.m_tile_end[0]; i += numbl0) {
-                            const index_type offset_0 =
-                                i * m_policy.m_tile[0] + thr_id0 +
-                                static_cast<index_type>(m_policy.m_lower[0]);
-                            if (offset_0 < m_policy.m_upper[0] &&
-                                thr_id0 < m_policy.m_tile[0]) {
-                              m_func(offset_0, offset_1, offset_2, offset_3,
-                                     offset_4, offset_5);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-    // LR
-    else {
-      index_type temp0        = m_policy.m_tile_end[0];
-      index_type temp1        = m_policy.m_tile_end[1];
-      const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl0 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl1)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1;
-      const index_type tile_id1 = static_cast<index_type>(blockIdx.x) % numbl1;
-      const index_type thr_id0 =
-          static_cast<index_type>(threadIdx.x) / m_policy.m_tile[1];
-      const index_type thr_id1 =
-          static_cast<index_type>(threadIdx.x) % m_policy.m_tile[1];
-
-      temp0                   = m_policy.m_tile_end[2];
-      temp1                   = m_policy.m_tile_end[3];
-      const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl2 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl3)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) / numbl3;
-      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) % numbl3;
-      const index_type thr_id2 =
-          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[3];
-      const index_type thr_id3 =
-          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[3];
-
-      temp0                   = m_policy.m_tile_end[4];
-      temp1                   = m_policy.m_tile_end[5];
-      const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl4 =
-          (temp0 * temp1 > max_blocks
-               ? index_type(max_blocks / numbl5)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id4 = static_cast<index_type>(blockIdx.z) / numbl5;
-      const index_type tile_id5 = static_cast<index_type>(blockIdx.z) % numbl5;
-      const index_type thr_id4 =
-          static_cast<index_type>(threadIdx.z) / m_policy.m_tile[5];
-      const index_type thr_id5 =
-          static_cast<index_type>(threadIdx.z) % m_policy.m_tile[5];
-
-      for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) {
-        const index_type offset_0 =
-            i * m_policy.m_tile[0] + thr_id0 +
-            static_cast<index_type>(m_policy.m_lower[0]);
-        if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) {
-          for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-               j += numbl1) {
-            const index_type offset_1 =
-                j * m_policy.m_tile[1] + thr_id1 +
-                static_cast<index_type>(m_policy.m_lower[1]);
-            if (offset_1 < m_policy.m_upper[1] &&
-                thr_id1 < m_policy.m_tile[1]) {
-              for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
-                   k += numbl2) {
-                const index_type offset_2 =
-                    k * m_policy.m_tile[2] + thr_id2 +
-                    static_cast<index_type>(m_policy.m_lower[2]);
-                if (offset_2 < m_policy.m_upper[2] &&
-                    thr_id2 < m_policy.m_tile[2]) {
-                  for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
-                       l += numbl3) {
-                    const index_type offset_3 =
-                        l * m_policy.m_tile[3] + thr_id3 +
-                        static_cast<index_type>(m_policy.m_lower[3]);
-                    if (offset_3 < m_policy.m_upper[3] &&
-                        thr_id3 < m_policy.m_tile[3]) {
-                      for (index_type m = tile_id4; m < m_policy.m_tile_end[4];
-                           m += numbl4) {
-                        const index_type offset_4 =
-                            m * m_policy.m_tile[4] + thr_id4 +
-                            static_cast<index_type>(m_policy.m_lower[4]);
-                        if (offset_4 < m_policy.m_upper[4] &&
-                            thr_id4 < m_policy.m_tile[4]) {
-                          for (index_type n = tile_id5;
-                               n < m_policy.m_tile_end[5]; n += numbl5) {
-                            const index_type offset_5 =
-                                n * m_policy.m_tile[5] + thr_id5 +
-                                static_cast<index_type>(m_policy.m_lower[5]);
-                            if (offset_5 < m_policy.m_upper[5] &&
-                                thr_id5 < m_policy.m_tile[5]) {
-                              m_func(offset_0, offset_1, offset_2, offset_3,
-                                     offset_4, offset_5);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-};
-
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag>
-struct DeviceIterateTile<6, PolicyType, Functor, Tag> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_)
-      : m_policy(policy_), m_func(f_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    // LL
-    if (PolicyType::inner_direction == PolicyType::Left) {
-      index_type temp0        = m_policy.m_tile_end[0];
-      index_type temp1        = m_policy.m_tile_end[1];
-      const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl1 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl0)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0;
-      const index_type tile_id1 = static_cast<index_type>(blockIdx.x) / numbl0;
-      const index_type thr_id0 =
-          static_cast<index_type>(threadIdx.x) % m_policy.m_tile[0];
-      const index_type thr_id1 =
-          static_cast<index_type>(threadIdx.x) / m_policy.m_tile[0];
-
-      temp0                   = m_policy.m_tile_end[2];
-      temp1                   = m_policy.m_tile_end[3];
-      const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl3 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl2)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) % numbl2;
-      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) / numbl2;
-      const index_type thr_id2 =
-          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[2];
-      const index_type thr_id3 =
-          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[2];
-
-      temp0                   = m_policy.m_tile_end[4];
-      temp1                   = m_policy.m_tile_end[5];
-      const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks);
-      const index_type numbl5 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl4)
-               : (temp1 <= max_blocks ? temp1 : max_blocks));
-
-      const index_type tile_id4 = static_cast<index_type>(blockIdx.z) % numbl4;
-      const index_type tile_id5 = static_cast<index_type>(blockIdx.z) / numbl4;
-      const index_type thr_id4 =
-          static_cast<index_type>(threadIdx.z) % m_policy.m_tile[4];
-      const index_type thr_id5 =
-          static_cast<index_type>(threadIdx.z) / m_policy.m_tile[4];
-
-      for (index_type n = tile_id5; n < m_policy.m_tile_end[5]; n += numbl5) {
-        const index_type offset_5 =
-            n * m_policy.m_tile[5] + thr_id5 +
-            static_cast<index_type>(m_policy.m_lower[5]);
-        if (offset_5 < m_policy.m_upper[5] && thr_id5 < m_policy.m_tile[5]) {
-          for (index_type m = tile_id4; m < m_policy.m_tile_end[4];
-               m += numbl4) {
-            const index_type offset_4 =
-                m * m_policy.m_tile[4] + thr_id4 +
-                static_cast<index_type>(m_policy.m_lower[4]);
-            if (offset_4 < m_policy.m_upper[4] &&
-                thr_id4 < m_policy.m_tile[4]) {
-              for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
-                   l += numbl3) {
-                const index_type offset_3 =
-                    l * m_policy.m_tile[3] + thr_id3 +
-                    static_cast<index_type>(m_policy.m_lower[3]);
-                if (offset_3 < m_policy.m_upper[3] &&
-                    thr_id3 < m_policy.m_tile[3]) {
-                  for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
-                       k += numbl2) {
-                    const index_type offset_2 =
-                        k * m_policy.m_tile[2] + thr_id2 +
-                        static_cast<index_type>(m_policy.m_lower[2]);
-                    if (offset_2 < m_policy.m_upper[2] &&
-                        thr_id2 < m_policy.m_tile[2]) {
-                      for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-                           j += numbl1) {
-                        const index_type offset_1 =
-                            j * m_policy.m_tile[1] + thr_id1 +
-                            static_cast<index_type>(m_policy.m_lower[1]);
-                        if (offset_1 < m_policy.m_upper[1] &&
-                            thr_id1 < m_policy.m_tile[1]) {
-                          for (index_type i = tile_id0;
-                               i < m_policy.m_tile_end[0]; i += numbl0) {
-                            const index_type offset_0 =
-                                i * m_policy.m_tile[0] + thr_id0 +
-                                static_cast<index_type>(m_policy.m_lower[0]);
-                            if (offset_0 < m_policy.m_upper[0] &&
-                                thr_id0 < m_policy.m_tile[0]) {
-                              m_func(Tag(), offset_0, offset_1, offset_2,
-                                     offset_3, offset_4, offset_5);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-    // LR
-    else {
-      index_type temp0        = m_policy.m_tile_end[0];
-      index_type temp1        = m_policy.m_tile_end[1];
-      const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl0 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl1)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1;
-      const index_type tile_id1 = static_cast<index_type>(blockIdx.x) % numbl1;
-      const index_type thr_id0 =
-          static_cast<index_type>(threadIdx.x) / m_policy.m_tile[1];
-      const index_type thr_id1 =
-          static_cast<index_type>(threadIdx.x) % m_policy.m_tile[1];
-
-      temp0                   = m_policy.m_tile_end[2];
-      temp1                   = m_policy.m_tile_end[3];
-      const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl2 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl3)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id2 = static_cast<index_type>(blockIdx.y) / numbl3;
-      const index_type tile_id3 = static_cast<index_type>(blockIdx.y) % numbl3;
-      const index_type thr_id2 =
-          static_cast<index_type>(threadIdx.y) / m_policy.m_tile[3];
-      const index_type thr_id3 =
-          static_cast<index_type>(threadIdx.y) % m_policy.m_tile[3];
-
-      temp0                   = m_policy.m_tile_end[4];
-      temp1                   = m_policy.m_tile_end[5];
-      const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks);
-      const index_type numbl4 =
-          (temp0 * temp1 > max_blocks
-               ? static_cast<index_type>(max_blocks / numbl5)
-               : (temp0 <= max_blocks ? temp0 : max_blocks));
-
-      const index_type tile_id4 = static_cast<index_type>(blockIdx.z) / numbl5;
-      const index_type tile_id5 = static_cast<index_type>(blockIdx.z) % numbl5;
-      const index_type thr_id4 =
-          static_cast<index_type>(threadIdx.z) / m_policy.m_tile[5];
-      const index_type thr_id5 =
-          static_cast<index_type>(threadIdx.z) % m_policy.m_tile[5];
-
-      for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) {
-        const index_type offset_0 =
-            i * m_policy.m_tile[0] + thr_id0 +
-            static_cast<index_type>(m_policy.m_lower[0]);
-        if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) {
-          for (index_type j = tile_id1; j < m_policy.m_tile_end[1];
-               j += numbl1) {
-            const index_type offset_1 =
-                j * m_policy.m_tile[1] + thr_id1 +
-                static_cast<index_type>(m_policy.m_lower[1]);
-            if (offset_1 < m_policy.m_upper[1] &&
-                thr_id1 < m_policy.m_tile[1]) {
-              for (index_type k = tile_id2; k < m_policy.m_tile_end[2];
-                   k += numbl2) {
-                const index_type offset_2 =
-                    k * m_policy.m_tile[2] + thr_id2 +
-                    static_cast<index_type>(m_policy.m_lower[2]);
-                if (offset_2 < m_policy.m_upper[2] &&
-                    thr_id2 < m_policy.m_tile[2]) {
-                  for (index_type l = tile_id3; l < m_policy.m_tile_end[3];
-                       l += numbl3) {
-                    const index_type offset_3 =
-                        l * m_policy.m_tile[3] + thr_id3 +
-                        static_cast<index_type>(m_policy.m_lower[3]);
-                    if (offset_3 < m_policy.m_upper[3] &&
-                        thr_id3 < m_policy.m_tile[3]) {
-                      for (index_type m = tile_id4; m < m_policy.m_tile_end[4];
-                           m += numbl4) {
-                        const index_type offset_4 =
-                            m * m_policy.m_tile[4] + thr_id4 +
-                            static_cast<index_type>(m_policy.m_lower[4]);
-                        if (offset_4 < m_policy.m_upper[4] &&
-                            thr_id4 < m_policy.m_tile[4]) {
-                          for (index_type n = tile_id5;
-                               n < m_policy.m_tile_end[5]; n += numbl5) {
-                            const index_type offset_5 =
-                                n * m_policy.m_tile[5] + thr_id5 +
-                                static_cast<index_type>(m_policy.m_lower[5]);
-                            if (offset_5 < m_policy.m_upper[5] &&
-                                thr_id5 < m_policy.m_tile[5]) {
-                              m_func(Tag(), offset_0, offset_1, offset_2,
-                                     offset_3, offset_4, offset_5);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-};
-
-// ----------------------------------------------------------------------------------
-
-namespace Reduce {
-
-template <typename T>
-using is_void = std::is_same<T, void>;
-
-template <typename T>
-struct is_array_type : std::false_type {
-  using value_type = T;
-};
-
-template <typename T>
-struct is_array_type<T*> : std::true_type {
-  using value_type = T;
-};
-
-template <typename T>
-struct is_array_type<T[]> : std::true_type {
-  using value_type = T;
-};
-
-// ------------------------------------------------------------------ //
-template <int N, typename PolicyType, typename Functor, typename Tag,
-          typename ValueType, typename Enable = void>
-struct DeviceIterateTile;
-
-// ParallelReduce iteration pattern
-// Scalar reductions
-
-// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of
-// tiles and reduction algorithm constraints extract n-dim tile offsets (i.e.
-// tile's global starting mulit-index) from the tileid = blockid using tile
-// dimensions local indices within a tile extracted from (index_type)threadIdx_x
-// using tile dims, constrained by blocksize combine tile and local id info for
-// multi-dim global ids
-
-// Pattern:
-// Each block+thread is responsible for a tile+local_id combo (additional when
-// striding by num_blocks)
-// 1. create offset arrays
-// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max
-// num blocks)
-// 3. temps set for tile_idx and thrd_idx, which will be modified
-// 4. if LL vs LR:
-//      determine tile starting point offsets (multidim)
-//      determine local index offsets (multidim)
-//      concatentate tile offset + local offset for global multi-dim index
-//    if offset withinin range bounds AND local offset within tile bounds, call
-//    functor
-
-// ValueType = T
-// Rank 2
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    2, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            // Deduce this blocks tile_id
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_v);
-          }
-        }
-      }
-    }
-
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    2, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] =
-                (thrd_idx %
-                 m_policy.m_tile[i]);  // Move this to first computation,
-                                       // add to m_offset right away
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Rank 3
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    3, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] =
-                (thrd_idx %
-                 m_policy.m_tile[i]);  // Move this to first computation,
-                                       // add to m_offset right away
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    3, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] =
-                (thrd_idx %
-                 m_policy.m_tile[i]);  // Move this to first computation,
-                                       // add to m_offset right away
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Rank 4
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    4, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    4, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Rank 5
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    5, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    5, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Rank 6
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    6, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    6, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<!is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                ValueType& v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  ValueType& m_v;
-};
-
-// ValueType = T[], T*
-// Rank 2
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    2, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] =
-                (thrd_idx %
-                 m_policy.m_tile[i]);  // Move this to first computation,
-                                       // add to m_offset right away
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    2, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_v);
-          }
-        }
-      }  // end for loop over num_tiles - product of tiles in each direction
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Rank 3
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    3, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] =
-                (thrd_idx %
-                 m_policy.m_tile[i]);  // Move this to first computation,
-                                       // add to m_offset right away
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] =
-                (thrd_idx %
-                 m_policy.m_tile[i]);  // Move this to first computation,
-                                       // add to m_offset right away
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    3, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  DeviceIterateTile(const PolicyType& policy_, const Functor& f_,
-                    value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Rank 4
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    4, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    4, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with (index_type)threadIdx_y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Rank 5
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    5, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    5, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
-
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
-
-// Rank 6
-// Specializations for void tag type
-template <typename PolicyType, typename Functor, typename ValueType>
-struct DeviceIterateTile<
-    6, PolicyType, Functor, void, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
-                                                const Functor& f_,
-                                                value_type* v_)
-      : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
-
-  KOKKOS_IMPL_DEVICE_FUNCTION
-  void exec_range() const {
-    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
-        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
-      index_type m_offset[PolicyType::rank];  // tile starting global id offset
-      index_type
-          m_local_offset[PolicyType::rank];  // tile starting global id offset
-
-      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
-           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
-        index_type tile_idx =
-            tileidx;  // temp because tile_idx will be modified while
-                      // determining tile starting point offsets
-        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
-        bool in_bounds      = true;
-
-        // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
-          for (int i = 0; i < PolicyType::rank; ++i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
-
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
-          }
-        }
-        // LR
-        else {
-          for (int i = PolicyType::rank - 1; i >= 0; --i) {
-            m_offset[i] =
-                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
-                m_policy.m_lower[i];
-            tile_idx /= m_policy.m_tile_end[i];
-
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
-            thrd_idx /= m_policy.m_tile[i];
+template <typename T>
+using value_type_storage_t =
+    typename std::conditional_t<is_array_type<T>::value, std::decay<T>,
+                                std::add_lvalue_reference<T> >::type;
 
-            m_offset[i] += m_local_offset[i];
-            if (!(m_offset[i] < m_policy.m_upper[i] &&
-                  m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
-            }
-          }
-          if (in_bounds) {
-            m_func(m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
-          }
-        }
-      }
-    }
-  }  // end exec_range
+// ParallelReduce iteration pattern
+// Scalar reductions
 
- private:
-  const PolicyType& m_policy;
-  const Functor& m_func;
-  value_type* m_v;
-};
+// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of
+// tiles and reduction algorithm constraints extract n-dim tile offsets (i.e.
+// tile's global starting mulit-index) from the tileid = blockid using tile
+// dimensions local indices within a tile extracted from (index_type)threadIdx_x
+// using tile dims, constrained by blocksize combine tile and local id info for
+// multi-dim global ids
 
-// Specializations for tag type
-template <typename PolicyType, typename Functor, typename Tag,
-          typename ValueType>
-struct DeviceIterateTile<
-    6, PolicyType, Functor, Tag, ValueType,
-    typename std::enable_if<is_array_type<ValueType>::value &&
-                            !is_void<Tag>::value>::type> {
-  using index_type = typename PolicyType::index_type;
-  using value_type = typename is_array_type<ValueType>::value_type;
+// Pattern:
+// Each block+thread is responsible for a tile+local_id combo (additional when
+// striding by num_blocks)
+// 1. create offset arrays
+// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max
+// num blocks)
+// 3. temps set for tile_idx and thrd_idx, which will be modified
+// 4. if LL vs LR:
+//      determine tile starting point offsets (multidim)
+//      determine local index offsets (multidim)
+//      concatentate tile offset + local offset for global multi-dim index
+//    if offset withinin range bounds AND local offset within tile bounds, call
+//    functor
 
+template <int N, typename PolicyType, typename Functor, typename Tag,
+          typename ValueType, typename Enable = void>
+struct DeviceIterateTile {
+  using index_type         = typename PolicyType::index_type;
+  using value_type_storage = value_type_storage_t<ValueType>;
+
+#ifdef KOKKOS_ENABLE_SYCL
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(
+      const PolicyType& policy_, const Functor& f_, value_type_storage v_,
+      const EmulateCUDADim3<index_type> gridDim_,
+      const EmulateCUDADim3<index_type> blockIdx_,
+      const EmulateCUDADim3<index_type> threadIdx_)
+      : m_policy(policy_),
+        m_func(f_),
+        m_v(v_),
+        gridDim(gridDim_),
+        blockIdx(blockIdx_),
+        threadIdx(threadIdx_) {}
+#else
   KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
                                                 const Functor& f_,
-                                                value_type* v_)
+                                                value_type_storage v_)
       : m_policy(policy_), m_func(f_), m_v(v_) {}
-
-  static constexpr index_type max_blocks = 65535;
+#endif
 
   KOKKOS_IMPL_DEVICE_FUNCTION
   void exec_range() const {
@@ -3186,26 +974,25 @@ struct DeviceIterateTile<
         bool in_bounds      = true;
 
         // LL
-        if (PolicyType::inner_direction == PolicyType::Left) {
+        if (PolicyType::inner_direction == Iterate::Left) {
           for (int i = 0; i < PolicyType::rank; ++i) {
             m_offset[i] =
                 (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
                 m_policy.m_lower[i];
             tile_idx /= m_policy.m_tile_end[i];
 
-            // tile-local indices identified with threadIdx.y
+            // tile-local indices identified with (index_type)threadIdx_y
             m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
             thrd_idx /= m_policy.m_tile[i];
 
             m_offset[i] += m_local_offset[i];
             if (!(m_offset[i] < m_policy.m_upper[i] &&
                   m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
+              in_bounds = false;
             }
           }
           if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
+            Impl::_tag_invoke_array<Tag>(m_func, m_offset, m_v);
           }
         }
         // LR
@@ -3216,19 +1003,21 @@ struct DeviceIterateTile<
                 m_policy.m_lower[i];
             tile_idx /= m_policy.m_tile_end[i];
 
-            // tile-local indices identified with threadIdx.y
-            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] =
+                (thrd_idx %
+                 m_policy.m_tile[i]);  // Move this to first computation,
+                                       // add to m_offset right away
             thrd_idx /= m_policy.m_tile[i];
 
             m_offset[i] += m_local_offset[i];
             if (!(m_offset[i] < m_policy.m_upper[i] &&
                   m_local_offset[i] < m_policy.m_tile[i])) {
-              in_bounds &= false;
+              in_bounds = false;
             }
           }
           if (in_bounds) {
-            m_func(Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3],
-                   m_offset[4], m_offset[5], m_v);
+            Impl::_tag_invoke_array<Tag>(m_func, m_offset, m_v);
           }
         }
       }
@@ -3238,7 +1027,12 @@ struct DeviceIterateTile<
  private:
   const PolicyType& m_policy;
   const Functor& m_func;
-  value_type* m_v;
+  value_type_storage m_v;
+#ifdef KOKKOS_ENABLE_SYCL
+  const EmulateCUDADim3<index_type> gridDim;
+  const EmulateCUDADim3<index_type> blockIdx;
+  const EmulateCUDADim3<index_type> threadIdx;
+#endif
 };
 
 }  // namespace Reduce
diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
index 6905d9e4066ed98d30231232a76b638f43b31634..c513817b5b8cbd74847e180099081bb475020c44 100644
--- a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -46,349 +46,91 @@
 #define KOKKOS_IMPL_ANALYZE_POLICY_HPP
 
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_Concepts.hpp>
-#include <impl/Kokkos_Tags.hpp>
-#include <impl/Kokkos_GraphImpl_fwd.hpp>
-#include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_EBO.hpp>
+#include <Kokkos_Concepts.hpp>  // IndexType
+#include <traits/Kokkos_Traits_fwd.hpp>
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+
+#include <traits/Kokkos_ExecutionSpaceTrait.hpp>
+#include <traits/Kokkos_GraphKernelTrait.hpp>
+#include <traits/Kokkos_IndexTypeTrait.hpp>
+#include <traits/Kokkos_IterationPatternTrait.hpp>
+#include <traits/Kokkos_LaunchBoundsTrait.hpp>
+#include <traits/Kokkos_OccupancyControlTrait.hpp>
+#include <traits/Kokkos_ScheduleTrait.hpp>
+#include <traits/Kokkos_WorkItemPropertyTrait.hpp>
+#include <traits/Kokkos_WorkTagTrait.hpp>
 
 namespace Kokkos {
-namespace Experimental {
-struct DesiredOccupancy {
-  int m_occ = 100;
-  explicit constexpr DesiredOccupancy(int occ) : m_occ(occ) {
-    KOKKOS_EXPECTS(0 <= occ && occ <= 100);
-  }
-  explicit constexpr operator int() const { return m_occ; }
-  constexpr int value() const { return m_occ; }
-  explicit DesiredOccupancy() = default;
-};
-struct MaximizeOccupancy {
-  explicit MaximizeOccupancy() = default;
-};
-}  // namespace Experimental
-
 namespace Impl {
-template <typename ExecutionSpace = void, typename Schedule = void,
-          typename WorkTag = void, typename IndexType = void,
-          typename IterationPattern = void, typename LaunchBounds = void,
-          typename MyWorkItemProperty =
-              Kokkos::Experimental::WorkItemProperty::None_t,
-          typename IsGraphKernel    = std::false_type,
-          typename OccupancyControl = Kokkos::Experimental::MaximizeOccupancy>
-struct PolicyTraitsBase {
-  using type =
-      PolicyTraitsBase<ExecutionSpace, Schedule, WorkTag, IndexType,
-                       IterationPattern, LaunchBounds, MyWorkItemProperty,
-                       IsGraphKernel, OccupancyControl>;
 
-  using execution_space    = ExecutionSpace;
-  using schedule_type      = Schedule;
-  using work_tag           = WorkTag;
-  using index_type         = IndexType;
-  using iteration_pattern  = IterationPattern;
-  using launch_bounds      = LaunchBounds;
-  using work_item_property = MyWorkItemProperty;
-  using is_graph_kernel    = IsGraphKernel;
-  using occupancy_control  = OccupancyControl;
+//------------------------------------------------------------------------------
+
+using execution_policy_trait_specifications =
+    type_list<ExecutionSpaceTrait, GraphKernelTrait, IndexTypeTrait,
+              IterationPatternTrait, LaunchBoundsTrait, OccupancyControlTrait,
+              ScheduleTrait, WorkItemPropertyTrait, WorkTagTrait>;
+
+//------------------------------------------------------------------------------
+// Ignore void for backwards compatibility purposes, though hopefully no one is
+// using this in application code
+template <class... Traits>
+struct AnalyzeExecPolicy<void, void, Traits...>
+    : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
 };
 
-template <typename PolicyBase, typename Property>
-struct SetWorkItemProperty {
-  static_assert(
-      std::is_same<typename PolicyBase::work_item_property,
-                   Kokkos::Experimental::WorkItemProperty::None_t>::value,
-      "Kokkos Error: More than one work item property given");
-  using type = PolicyTraitsBase<
-      typename PolicyBase::execution_space, typename PolicyBase::schedule_type,
-      typename PolicyBase::work_tag, typename PolicyBase::index_type,
-      typename PolicyBase::iteration_pattern,
-      typename PolicyBase::launch_bounds, Property,
-      typename PolicyBase::is_graph_kernel,
-      typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase, typename ExecutionSpace>
-struct SetExecutionSpace {
-  static_assert(is_void<typename PolicyBase::execution_space>::value,
-                "Kokkos Error: More than one execution space given");
-  using type =
-      PolicyTraitsBase<ExecutionSpace, typename PolicyBase::schedule_type,
-                       typename PolicyBase::work_tag,
-                       typename PolicyBase::index_type,
-                       typename PolicyBase::iteration_pattern,
-                       typename PolicyBase::launch_bounds,
-                       typename PolicyBase::work_item_property,
-                       typename PolicyBase::is_graph_kernel,
-                       typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase, typename Schedule>
-struct SetSchedule {
-  static_assert(is_void<typename PolicyBase::schedule_type>::value,
-                "Kokkos Error: More than one schedule type given");
-  using type = PolicyTraitsBase<typename PolicyBase::execution_space, Schedule,
-                                typename PolicyBase::work_tag,
-                                typename PolicyBase::index_type,
-                                typename PolicyBase::iteration_pattern,
-                                typename PolicyBase::launch_bounds,
-                                typename PolicyBase::work_item_property,
-                                typename PolicyBase::is_graph_kernel,
-                                typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase, typename WorkTag>
-struct SetWorkTag {
-  static_assert(is_void<typename PolicyBase::work_tag>::value,
-                "Kokkos Error: More than one work tag given");
-  using type = PolicyTraitsBase<typename PolicyBase::execution_space,
-                                typename PolicyBase::schedule_type, WorkTag,
-                                typename PolicyBase::index_type,
-                                typename PolicyBase::iteration_pattern,
-                                typename PolicyBase::launch_bounds,
-                                typename PolicyBase::work_item_property,
-                                typename PolicyBase::is_graph_kernel,
-                                typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase, typename IndexType>
-struct SetIndexType {
-  static_assert(is_void<typename PolicyBase::index_type>::value,
-                "Kokkos Error: More than one index type given");
-  using type = PolicyTraitsBase<typename PolicyBase::execution_space,
-                                typename PolicyBase::schedule_type,
-                                typename PolicyBase::work_tag, IndexType,
-                                typename PolicyBase::iteration_pattern,
-                                typename PolicyBase::launch_bounds,
-                                typename PolicyBase::work_item_property,
-                                typename PolicyBase::is_graph_kernel,
-                                typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase, typename IterationPattern>
-struct SetIterationPattern {
-  static_assert(is_void<typename PolicyBase::iteration_pattern>::value,
-                "Kokkos Error: More than one iteration_pattern given");
-  using type = PolicyTraitsBase<
-      typename PolicyBase::execution_space, typename PolicyBase::schedule_type,
-      typename PolicyBase::work_tag, typename PolicyBase::index_type,
-      IterationPattern, typename PolicyBase::launch_bounds,
-      typename PolicyBase::work_item_property,
-      typename PolicyBase::is_graph_kernel,
-      typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase, typename LaunchBounds>
-struct SetLaunchBounds {
-  static_assert(is_void<typename PolicyBase::launch_bounds>::value,
-                "Kokkos Error: More than one launch_bounds given");
-  using type = PolicyTraitsBase<
-      typename PolicyBase::execution_space, typename PolicyBase::schedule_type,
-      typename PolicyBase::work_tag, typename PolicyBase::index_type,
-      typename PolicyBase::iteration_pattern, LaunchBounds,
-      typename PolicyBase::work_item_property,
-      typename PolicyBase::is_graph_kernel,
-      typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase>
-struct SetIsGraphKernel {
-  using type = PolicyTraitsBase<
-      typename PolicyBase::execution_space, typename PolicyBase::schedule_type,
-      typename PolicyBase::work_tag, typename PolicyBase::index_type,
-      typename PolicyBase::iteration_pattern,
-      typename PolicyBase::launch_bounds,
-      typename PolicyBase::work_item_property, std::true_type,
-      typename PolicyBase::occupancy_control>;
-};
-
-template <typename PolicyBase, typename OccupancyControl>
-struct SetOccupancyControl {
-  using type = PolicyTraitsBase<
-      typename PolicyBase::execution_space, typename PolicyBase::schedule_type,
-      typename PolicyBase::work_tag, typename PolicyBase::index_type,
-      typename PolicyBase::iteration_pattern,
-      typename PolicyBase::launch_bounds,
-      typename PolicyBase::work_item_property,
-      typename PolicyBase::is_graph_kernel, OccupancyControl>;
-};
-
-template <typename Base, typename... Traits>
-struct AnalyzePolicy;
-
-// TODO DSH rewrite this to be more extensible once we have metaprogramming from
-//      desul
-template <typename Base, typename T, typename... Traits>
-struct AnalyzePolicy<Base, T, Traits...>
-    : public AnalyzePolicy<
-          typename std::conditional_t<
-              is_execution_space<T>::value, SetExecutionSpace<Base, T>,
-              std::conditional_t<
-                  is_schedule_type<T>::value, SetSchedule<Base, T>,
-                  std::conditional_t<
-                      is_index_type<T>::value, SetIndexType<Base, T>,
-                      std::conditional_t<
-                          std::is_integral<T>::value,
-                          SetIndexType<Base, IndexType<T>>,
-                          std::conditional_t<
-                              is_iteration_pattern<T>::value,
-                              SetIterationPattern<Base, T>,
-                              std::conditional_t<
-                                  is_launch_bounds<T>::value,
-                                  SetLaunchBounds<Base, T>,
-                                  std::conditional_t<
-                                      Kokkos::Experimental::
-                                          is_work_item_property<T>::value,
-                                      SetWorkItemProperty<Base, T>,
-                                      std::conditional_t<
-                                          std::is_same<T,
-                                                       IsGraphKernelTag>::value,
-                                          SetIsGraphKernel<Base>,
-                                          std::conditional_t<
-                                              std::is_same<
-                                                  T, Kokkos::Experimental::
-                                                         DesiredOccupancy>::
-                                                      value ||
-                                                  std::is_same<
-                                                      T,
-                                                      Kokkos::Experimental::
-                                                          MaximizeOccupancy>::
-                                                      value,
-                                              SetOccupancyControl<Base, T>,
-                                              std::conditional_t<
-                                                  !std::is_void<T>::value,
-                                                  SetWorkTag<Base, T>,
-                                                  Base>>>>>>>>>>::type,
-          Traits...> {};
-
-template <typename Base>
-struct AnalyzePolicy<Base> {
-  static constexpr auto execution_space_is_defaulted =
-      std::is_void<typename Base::execution_space>::value;
-  using execution_space =
-      typename std::conditional<execution_space_is_defaulted,
-                                DefaultExecutionSpace,
-                                typename Base::execution_space>::type;
-
-  using schedule_type =
-      typename std::conditional<is_void<typename Base::schedule_type>::value,
-                                Schedule<Static>,
-                                typename Base::schedule_type>::type;
-
-  using work_tag = typename Base::work_tag;
-
-  using index_type =
-      typename std::conditional<is_void<typename Base::index_type>::value,
-                                IndexType<typename execution_space::size_type>,
-                                typename Base::index_type>::type::type;
-  // nasty hack to make index_type into an integral_type
-  // instead of the wrapped IndexType<T> for backwards compatibility
-
-  using iteration_pattern = typename std::conditional<
-      is_void<typename Base::iteration_pattern>::value,
-      void  // TODO set default iteration pattern
-      ,
-      typename Base::iteration_pattern>::type;
-
-  using launch_bounds =
-      typename std::conditional<is_void<typename Base::launch_bounds>::value,
-                                LaunchBounds<>,
-                                typename Base::launch_bounds>::type;
-
-  using work_item_property = typename Base::work_item_property;
-
-  using is_graph_kernel = typename Base::is_graph_kernel;
-
-  using occupancy_control = typename Base::occupancy_control;
-
-  using type =
-      PolicyTraitsBase<execution_space, schedule_type, work_tag, index_type,
-                       iteration_pattern, launch_bounds, work_item_property,
-                       is_graph_kernel, occupancy_control>;
-};
-
-template <class AnalyzedPolicy>
-struct PolicyDataStorage : AnalyzedPolicy,
-                           NoUniqueAddressMemberEmulation<
-                               typename AnalyzedPolicy::occupancy_control> {
-  using occupancy_control_t = typename AnalyzedPolicy::occupancy_control;
-
-  using occupancy_control_storage_base_t =
-      NoUniqueAddressMemberEmulation<occupancy_control_t>;
-
-  static constexpr bool experimental_contains_desired_occupancy =
-      std::is_same<occupancy_control_t,
-                   Kokkos::Experimental::DesiredOccupancy>::value;
-
-  PolicyDataStorage() = default;
-
-  // Converting constructors
-  template <
-      class Other,
-      std::enable_if_t<
-          experimental_contains_desired_occupancy &&
-              PolicyDataStorage<Other>::experimental_contains_desired_occupancy,
-          int> = 0>
-  PolicyDataStorage(PolicyDataStorage<Other> const &other) {
-    this->impl_set_desired_occupancy(other.impl_get_desired_occupancy());
-  }
-
-  template <class Other,
-            std::enable_if_t<!experimental_contains_desired_occupancy ||
-                                 !PolicyDataStorage<Other>::
-                                     experimental_contains_desired_occupancy,
-                             int> = 0>
-  PolicyDataStorage(PolicyDataStorage<Other> const &) {}
-
-  // Converting assignment operators
-  template <
-      class Other,
-      std::enable_if_t<
-          experimental_contains_desired_occupancy &&
-              PolicyDataStorage<Other>::experimental_contains_desired_occupancy,
-          int> = 0>
-  PolicyDataStorage &operator=(PolicyDataStorage<Other> const &other) {
-    this->impl_set_desired_occupancy(other.impl_get_desired_occupancy());
-    return *this;
-  }
-
-  template <class Other,
-            std::enable_if_t<!experimental_contains_desired_occupancy ||
-                                 !PolicyDataStorage<Other>::
-                                     experimental_contains_desired_occupancy,
-                             int> = 0>
-  PolicyDataStorage &operator=(PolicyDataStorage<Other> const &) {
+//------------------------------------------------------------------------------
+// Mix in the defaults (base_traits) for the traits that aren't yet handled
+
+template <class TraitSpecList>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
+template <class... TraitSpecifications>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
+    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
+    : TraitSpecifications::base_traits... {};
+
+template <>
+struct AnalyzeExecPolicy<void>
+    : AnalyzeExecPolicyBaseTraits<execution_policy_trait_specifications> {
+  // Ensure default constructibility since a converting constructor causes it to
+  // be deleted.
+  AnalyzeExecPolicy() = default;
+
+  // Base converting constructor and assignment operator: unless an individual
+  // policy analysis deletes a constructor, assume it's convertible
+  template <class Other>
+  AnalyzeExecPolicy(ExecPolicyTraitsWithDefaults<Other> const&) {}
+
+  template <class Other>
+  AnalyzeExecPolicy& operator=(ExecPolicyTraitsWithDefaults<Other> const&) {
     return *this;
   }
+};
 
-  // Access to desired occupancy (getter and setter)
-  template <class Dummy = occupancy_control_t>
-  std::enable_if_t<std::is_same<Dummy, occupancy_control_t>::value &&
-                       experimental_contains_desired_occupancy,
-                   Kokkos::Experimental::DesiredOccupancy>
-  impl_get_desired_occupancy() const {
-    return this
-        ->occupancy_control_storage_base_t::no_unique_address_data_member();
-  }
-
-  template <class Dummy = occupancy_control_t>
-  std::enable_if_t<std::is_same<Dummy, occupancy_control_t>::value &&
-                   experimental_contains_desired_occupancy>
-  impl_set_desired_occupancy(occupancy_control_t desired_occupancy) {
-    this->occupancy_control_storage_base_t::no_unique_address_data_member() =
-        desired_occupancy;
-  }
+//------------------------------------------------------------------------------
+// Used for defaults that depend on other analysis results
+template <class AnalysisResults>
+struct ExecPolicyTraitsWithDefaults : AnalysisResults {
+  using base_t = AnalysisResults;
+  using base_t::base_t;
+  // The old code turned this into an integral type for backwards compatibility,
+  // so that's what we're doing here. The original comment was:
+  //   nasty hack to make index_type into an integral_type
+  //   instead of the wrapped IndexType<T> for backwards compatibility
+  using index_type = typename std::conditional_t<
+      base_t::index_type_is_defaulted,
+      Kokkos::IndexType<typename base_t::execution_space::size_type>,
+      typename base_t::index_type>::type;
 };
 
+//------------------------------------------------------------------------------
 template <typename... Traits>
 struct PolicyTraits
-    : PolicyDataStorage<
-          typename AnalyzePolicy<PolicyTraitsBase<>, Traits...>::type> {
-  using base_t = PolicyDataStorage<
-      typename AnalyzePolicy<PolicyTraitsBase<>, Traits...>::type>;
-  template <class... Args>
-  PolicyTraits(PolicyTraits<Args...> const &p) : base_t(p) {}
-  PolicyTraits() = default;
+    : ExecPolicyTraitsWithDefaults<AnalyzeExecPolicy<void, Traits...>> {
+  using base_t =
+      ExecPolicyTraitsWithDefaults<AnalyzeExecPolicy<void, Traits...>>;
+  using base_t::base_t;
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
index 8ed130d15fcabb443dbb693ce6240ffb2ab7dd58..f2c1c756a910d26de0eb3765e0b90684e564d243 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -191,8 +191,7 @@ inline T atomic_exchange(volatile T* const dest,
                          typename std::enable_if<sizeof(T) == sizeof(int) ||
                                                      sizeof(T) == sizeof(long),
                                                  const T&>::type val) {
-  using type =
-      typename Kokkos::Impl::if_c<sizeof(T) == sizeof(int), int, long>::type;
+  using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
@@ -285,8 +284,7 @@ inline void atomic_assign(volatile T* const dest,
                           typename std::enable_if<sizeof(T) == sizeof(int) ||
                                                       sizeof(T) == sizeof(long),
                                                   const T&>::type val) {
-  using type =
-      typename Kokkos::Impl::if_c<sizeof(T) == sizeof(int), int, long>::type;
+  using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
 
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
index 6b9e4a4a250518826ed5341ee5f461fc4c021d38..28ac7a3bab9e748f9d315ca479f57db885ed75c4 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -345,7 +345,7 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
   return return_val;
 #elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL)
   // FIXME_SYCL
-  std::abort();
+  Kokkos::abort("Not implemented!");
   (void)op;
   (void)dest;
   (void)val;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp
index 8a886d0a7757ae085a223511162a0ed956dd6d6a..7338a5c545f25f58662c2e05c4a20bda4992e203 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp
@@ -101,6 +101,52 @@ inline __host__ unsigned long long int atomic_fetch_max(
 
 #endif
 
+#if (350 > __CUDA_ARCH__)
+
+// Fallback for atomic{Min,Max} for Kepler
+
+inline __device__ int atomic_fetch_min(volatile int* const dest,
+                                       const int val) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<const int, const int>(), dest,
+                                 val);
+}
+
+inline __device__ unsigned int atomic_fetch_min(
+    volatile unsigned int* const dest, const unsigned int val) {
+  return Impl::atomic_fetch_oper(
+      Impl::MinOper<const unsigned int, const unsigned int>(), dest, val);
+}
+
+inline __device__ unsigned long long int atomic_fetch_min(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<const unsigned long long int,
+                                               const unsigned long long int>(),
+                                 dest, val);
+}
+
+inline __device__ int atomic_fetch_max(volatile int* const dest,
+                                       const int val) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<const int, const int>(), dest,
+                                 val);
+}
+
+inline __device__ unsigned int atomic_fetch_max(
+    volatile unsigned int* const dest, const unsigned int val) {
+  return Impl::atomic_fetch_oper(
+      Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val);
+}
+
+inline __device__ unsigned long long int atomic_fetch_max(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<const unsigned long long int,
+                                               const unsigned long long int>(),
+                                 dest, val);
+}
+
+#else  // Supported by devices of compute capability 3.5 and higher
+
 inline __device__ int atomic_fetch_min(volatile int* const dest,
                                        const int val) {
   return atomicMin((int*)dest, val);
@@ -133,6 +179,8 @@ inline __device__ unsigned long long int atomic_fetch_max(
   return atomicMax((unsigned long long int*)dest, val);
 }
 
+#endif
+
 // Atomic_{min,max}_fetch
 
 #ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
@@ -178,6 +226,52 @@ inline __host__ unsigned long long int atomic_max_fetch(
 }
 #endif
 
+#if (350 > __CUDA_ARCH__)
+
+// Fallback for atomic{Min,Max} for Kepler
+
+inline __device__ int atomic_min_fetch(volatile int* const dest,
+                                       const int val) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<const int, const int>(), dest,
+                                 val);
+}
+
+inline __device__ unsigned int atomic_min_fetch(
+    volatile unsigned int* const dest, const unsigned int val) {
+  return Impl::atomic_oper_fetch(
+      Impl::MinOper<const unsigned int, const unsigned int>(), dest, val);
+}
+
+inline __device__ unsigned long long int atomic_min_fetch(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<const unsigned long long int,
+                                               const unsigned long long int>(),
+                                 dest, val);
+}
+
+inline __device__ int atomic_max_fetch(volatile int* const dest,
+                                       const int val) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<const int, const int>(), dest,
+                                 val);
+}
+
+inline __device__ unsigned int atomic_max_fetch(
+    volatile unsigned int* const dest, const unsigned int val) {
+  return Impl::atomic_oper_fetch(
+      Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val);
+}
+
+inline __device__ unsigned long long int atomic_max_fetch(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<const unsigned long long int,
+                                               const unsigned long long int>(),
+                                 dest, val);
+}
+
+#else  // Supported by devices of compute capability 3.5 and higher
+
 inline __device__ int atomic_min_fetch(volatile int* const dest,
                                        const int val) {
   const int old = atomicMin((int*)dest, val);
@@ -216,6 +310,8 @@ inline __device__ unsigned long long int atomic_max_fetch(
   return old >= val ? old : val;
 }
 
+#endif
+
 #endif
 #endif
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
index 3916a1b03d58ba718ce9492b5270b071bcc4b55b..975318b7dde67a1d1569c3cf657060c3ae18215d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -299,14 +299,18 @@ class AtomicDataElement {
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator==(const_value_type& val) const { return *ptr == val; }
+  bool operator==(const AtomicDataElement& val) const { return *ptr == val; }
   KOKKOS_INLINE_FUNCTION
-  bool operator==(volatile const_value_type& val) const { return *ptr == val; }
+  bool operator==(volatile const AtomicDataElement& val) const {
+    return *ptr == val;
+  }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator!=(const_value_type& val) const { return *ptr != val; }
+  bool operator!=(const AtomicDataElement& val) const { return *ptr != val; }
   KOKKOS_INLINE_FUNCTION
-  bool operator!=(volatile const_value_type& val) const { return *ptr != val; }
+  bool operator!=(volatile const AtomicDataElement& val) const {
+    return *ptr != val;
+  }
 
   KOKKOS_INLINE_FUNCTION
   bool operator>=(const_value_type& val) const { return *ptr >= val; }
diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
index 8c2d53ba14628bfbd075155a846794fef5d13728..4e46b8d157f83129182d4db9b725bcddbe3ed28b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -52,6 +52,15 @@
 #include <omp.h>
 #endif
 
+// To use OpenCL(TM) built-in intrinsics inside kernels, we have to
+// forward-declare their prototype, also see
+// https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+    defined(__SYCL_DEVICE_ONLY__)
+extern SYCL_EXTERNAL unsigned long __attribute__((overloadable))
+intel_get_cycle_counter();
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -69,13 +78,16 @@ namespace Impl {
  */
 
 KOKKOS_FORCEINLINE_FUNCTION
-uint64_t clock_tic(void) noexcept {
+uint64_t clock_tic() noexcept {
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 
   // Return value of 64-bit hi-res clock register.
 
   return clock64();
 
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+    defined(__SYCL_DEVICE_ONLY__)
+  return intel_get_cycle_counter();
 #elif defined(KOKKOS_ENABLE_OPENMPTARGET)
   return uint64_t(omp_get_wtime() * 1.e9);
 #elif defined(__i386__) || defined(__x86_64)
diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
index 61c1375900ecb88add73b0e8a07156b4d6bb94d4..06681a95ae902c613c701cd78ff572d35da6c0a1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
@@ -53,6 +53,8 @@
 #include <Kokkos_AnonymousSpace.hpp>
 #include <impl/Kokkos_Utilities.hpp>  // comma operator fold emulation
 
+#include <utility>
+
 namespace Kokkos {
 namespace Impl {
 
@@ -99,7 +101,7 @@ template <class IdxSeq, class... ValueTypes>
 struct CombinedReducerValueImpl;
 
 template <size_t... Idxs, class... ValueTypes>
-struct CombinedReducerValueImpl<integer_sequence<size_t, Idxs...>,
+struct CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
                                 ValueTypes...>
     : CombinedReducerValueItemImpl<Idxs, ValueTypes>... {
  public:
@@ -220,14 +222,15 @@ template <class IdxSeq, class Space, class...>
 struct CombinedReducerImpl;
 
 template <size_t... Idxs, class Space, class... Reducers>
-struct CombinedReducerImpl<integer_sequence<size_t, Idxs...>, Space,
+struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
                            Reducers...>
     : private CombinedReducerStorageImpl<Idxs, Reducers>... {
  public:
-  using reducer = CombinedReducerImpl<integer_sequence<size_t, Idxs...>, Space,
-                                      Reducers...>;
-  using value_type = CombinedReducerValueImpl<integer_sequence<size_t, Idxs...>,
-                                              typename Reducers::value_type...>;
+  using reducer = CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>,
+                                      Space, Reducers...>;
+  using value_type =
+      CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
+                               typename Reducers::value_type...>;
   using result_view_type =
       Kokkos::View<value_type, Space, Kokkos::MemoryUnmanaged>;
 
@@ -309,10 +312,11 @@ struct CombinedReducerImpl<integer_sequence<size_t, Idxs...>, Space,
 // thing.
 template <class Space, class... Reducers>
 struct CombinedReducer
-    : CombinedReducerImpl<make_index_sequence<sizeof...(Reducers)>, Space,
+    : CombinedReducerImpl<std::make_index_sequence<sizeof...(Reducers)>, Space,
                           Reducers...> {
-  using base_t = CombinedReducerImpl<make_index_sequence<sizeof...(Reducers)>,
-                                     Space, Reducers...>;
+  using base_t =
+      CombinedReducerImpl<std::make_index_sequence<sizeof...(Reducers)>, Space,
+                          Reducers...>;
   using base_t::base_t;
   using reducer = CombinedReducer<Space, Reducers...>;
 };
@@ -327,8 +331,8 @@ template <class IdxSeq, class Functor, class Space, class... Reducers>
 struct CombinedReductionFunctorWrapperImpl;
 
 template <size_t... Idxs, class Functor, class Space, class... Reducers>
-struct CombinedReductionFunctorWrapperImpl<integer_sequence<size_t, Idxs...>,
-                                           Functor, Space, Reducers...> {
+struct CombinedReductionFunctorWrapperImpl<
+    std::integer_sequence<size_t, Idxs...>, Functor, Space, Reducers...> {
  private:
   Functor m_functor;
 
@@ -425,10 +429,11 @@ struct CombinedReductionFunctorWrapperImpl<integer_sequence<size_t, Idxs...>,
 template <class Functor, class Space, class... Reducers>
 struct CombinedReductionFunctorWrapper
     : CombinedReductionFunctorWrapperImpl<
-          make_index_sequence<sizeof...(Reducers)>, Functor, Space,
+          std::make_index_sequence<sizeof...(Reducers)>, Functor, Space,
           Reducers...> {
   using base_t = CombinedReductionFunctorWrapperImpl<
-      make_index_sequence<sizeof...(Reducers)>, Functor, Space, Reducers...>;
+      std::make_index_sequence<sizeof...(Reducers)>, Functor, Space,
+      Reducers...>;
   using base_t::base_t;
 };
 
@@ -488,11 +493,8 @@ using _reducer_from_arg_t =
 //------------------------------------------------------------------------------
 
 template <class Space, class... ReferencesOrViewsOrReducers>
-KOKKOS_INLINE_FUNCTION constexpr CombinedReducerValueImpl<
-    make_index_sequence<sizeof...(ReferencesOrViewsOrReducers)>,
-    typename _reducer_from_arg_t<Space,
-                                 ReferencesOrViewsOrReducers>::value_type...>
-make_combined_reducer_value(ReferencesOrViewsOrReducers&&... args) {
+KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value(
+    ReferencesOrViewsOrReducers&&... args) {
   //----------------------------------------
   // This is a bit round-about and we should make sure it doesn't have
   // any performance implications. Basically, we make a reducer out of anything
@@ -500,7 +502,7 @@ make_combined_reducer_value(ReferencesOrViewsOrReducers&&... args) {
   // compilers should figure out what's going on, but we should double-check
   // that.
   return CombinedReducerValueImpl<
-      make_index_sequence<sizeof...(ReferencesOrViewsOrReducers)>,
+      std::make_index_sequence<sizeof...(ReferencesOrViewsOrReducers)>,
       typename _reducer_from_arg_t<Space,
                                    ReferencesOrViewsOrReducers>::value_type...>{
       // This helper function is now poorly named after refactoring.
@@ -510,9 +512,8 @@ make_combined_reducer_value(ReferencesOrViewsOrReducers&&... args) {
 }
 
 template <class Space, class ValueType, class... ReferencesOrViewsOrReducers>
-KOKKOS_INLINE_FUNCTION constexpr CombinedReducer<
-    Space, _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>...>
-make_combined_reducer(ValueType& value, ReferencesOrViewsOrReducers&&... args) {
+KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer(
+    ValueType& value, ReferencesOrViewsOrReducers&&... args) {
   //----------------------------------------
   // This is doing more or less the same thing of making every argument into
   // a reducer, just in a different place than in `make_combined_reducer_value`,
@@ -526,10 +527,8 @@ make_combined_reducer(ValueType& value, ReferencesOrViewsOrReducers&&... args) {
 }
 
 template <class Functor, class Space, class... ReferencesOrViewsOrReducers>
-KOKKOS_INLINE_FUNCTION constexpr CombinedReductionFunctorWrapper<
-    Functor, Space, _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>...>
-make_wrapped_combined_functor(Functor const& functor, Space,
-                              ReferencesOrViewsOrReducers&&...) {
+KOKKOS_INLINE_FUNCTION constexpr auto make_wrapped_combined_functor(
+    Functor const& functor, Space, ReferencesOrViewsOrReducers&&...) {
   //----------------------------------------
   return CombinedReductionFunctorWrapper<
       Functor, Space,
diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
index d6dc384f2ff6f98c82736b5c603810706cee1ff7..b4769fbeaa53be8353df315ede634708da1b297d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -54,6 +54,7 @@
 #include <functional>
 #include <list>
 #include <cerrno>
+#include <regex>
 #ifndef _WIN32
 #include <unistd.h>
 #else
@@ -80,6 +81,23 @@ std::stack<hook_function_type, std::list<hook_function_type>> finalize_hooks;
 
 namespace Kokkos {
 namespace Impl {
+/**
+ * The category is only used in printing, tools
+ * get all metadata free of category
+ */
+using metadata_category_type = std::string;
+using metadata_key_type      = std::string;
+using metadata_value_type    = std::string;
+
+std::map<metadata_category_type,
+         std::map<metadata_key_type, metadata_value_type>>
+    metadata_map;
+
+void declare_configuration_metadata(const std::string& category,
+                                    const std::string& key,
+                                    const std::string& value) {
+  metadata_map[category][key] = value;
+}
 
 ExecSpaceManager& ExecSpaceManager::get_instance() {
   static ExecSpaceManager space_initializer = {};
@@ -210,8 +228,19 @@ int get_ctest_gpu(const char* local_rank_str) {
 
 // function to extract gpu # from args
 int get_gpu(const InitArguments& args) {
-  int use_gpu           = args.device_id;
-  const int ndevices    = args.ndevices;
+  int use_gpu        = args.device_id;
+  const int ndevices = [](int num_devices) -> int {
+    if (num_devices > 0) return num_devices;
+#if defined(KOKKOS_ENABLE_CUDA)
+    return Cuda::detect_device_count();
+#elif defined(KOKKOS_ENABLE_HIP)
+    return Experimental::HIP::detect_device_count();
+#elif defined(KOKKOS_ENABLE_SYCL)
+    return sycl::device::get_devices(sycl::info::device_type::gpu).size();
+#else
+    return num_devices;
+#endif
+  }(args.ndevices);
   const int skip_device = args.skip_device;
 
   // if the exact device is not set, but ndevices was given, assign round-robin
@@ -232,7 +261,7 @@ int get_gpu(const InitArguments& args) {
         local_rank_str) {
       // Use the device assigned by CTest
       use_gpu = get_ctest_gpu(local_rank_str);
-    } else if (ndevices >= 0) {
+    } else if (ndevices > 0) {
       // Use the device assigned by the rank
       if (local_rank_str) {
         auto local_rank = std::stoi(local_rank_str);
@@ -270,13 +299,221 @@ void initialize_backends(const InitArguments& args) {
   Impl::ExecSpaceManager::get_instance().initialize_spaces(args);
 }
 
-void initialize_profiling(const InitArguments&) {
-  Kokkos::Profiling::initialize();
+void initialize_profiling(const InitArguments& args) {
+  Kokkos::Profiling::initialize(args.tool_lib);
+  if (args.tool_help) {
+    if (!Kokkos::Tools::printHelp(args.tool_args)) {
+      std::cerr << "Tool has not provided a help message" << std::endl;
+    }
+    g_is_initialized = true;
+    ::Kokkos::finalize();
+    std::exit(EXIT_SUCCESS);
+  }
+  Kokkos::Tools::parseArgs(args.tool_args);
+  for (const auto& category_value : Kokkos::Impl::metadata_map) {
+    for (const auto& key_value : category_value.second) {
+      Kokkos::Tools::declareMetadata(key_value.first, key_value.second);
+    }
+  }
 }
 
+std::string version_string_from_int(int version_number) {
+  std::stringstream str_builder;
+  str_builder << version_number / 10000 << "." << (version_number % 10000) / 100
+              << "." << version_number % 100;
+  return str_builder.str();
+}
 void pre_initialize_internal(const InitArguments& args) {
   if (args.disable_warnings) g_show_warnings = false;
   if (args.tune_internals) g_tune_internals = true;
+  declare_configuration_metadata("version_info", "Kokkos Version",
+                                 version_string_from_int(KOKKOS_VERSION));
+#ifdef KOKKOS_COMPILER_APPLECC
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_APPLECC",
+                                 std::to_string(KOKKOS_COMPILER_APPLECC));
+  declare_configuration_metadata("tools_only", "compiler_family", "apple");
+#endif
+#ifdef KOKKOS_COMPILER_CLANG
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_CLANG",
+                                 std::to_string(KOKKOS_COMPILER_CLANG));
+  declare_configuration_metadata("tools_only", "compiler_family", "clang");
+#endif
+#ifdef KOKKOS_COMPILER_CRAYC
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_CRAYC",
+                                 std::to_string(KOKKOS_COMPILER_CRAYC));
+  declare_configuration_metadata("tools_only", "compiler_family", "cray");
+#endif
+#ifdef KOKKOS_COMPILER_GNU
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_GNU",
+                                 std::to_string(KOKKOS_COMPILER_GNU));
+  declare_configuration_metadata("tools_only", "compiler_family", "gnu");
+#endif
+#ifdef KOKKOS_COMPILER_IBM
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_IBM",
+                                 std::to_string(KOKKOS_COMPILER_IBM));
+  declare_configuration_metadata("tools_only", "compiler_family", "ibm");
+#endif
+#ifdef KOKKOS_COMPILER_INTEL
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_INTEL",
+                                 std::to_string(KOKKOS_COMPILER_INTEL));
+  declare_configuration_metadata("tools_only", "compiler_family", "intel");
+#endif
+#ifdef KOKKOS_COMPILER_NVCC
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_NVCC",
+                                 std::to_string(KOKKOS_COMPILER_NVCC));
+  declare_configuration_metadata("tools_only", "compiler_family", "nvcc");
+#endif
+#ifdef KOKKOS_COMPILER_PGI
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_PGI",
+                                 std::to_string(KOKKOS_COMPILER_PGI));
+  declare_configuration_metadata("tools_only", "compiler_family", "pgi");
+#endif
+#ifdef KOKKOS_COMPILER_MSVC
+  declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_MSVC",
+                                 std::to_string(KOKKOS_COMPILER_MSVC));
+  declare_configuration_metadata("tools_only", "compiler_family", "msvc");
+#endif
+#ifdef KOKKOS_ENABLE_ISA_KNC
+  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_KNC",
+                                 "yes");
+#else
+  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_KNC", "no");
+#endif
+#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
+  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_POWERPCLE",
+                                 "yes");
+#else
+  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_POWERPCLE",
+                                 "no");
+#endif
+#ifdef KOKKOS_ENABLE_ISA_X86_64
+  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_X86_64",
+                                 "yes");
+#else
+  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_X86_64",
+                                 "no");
+#endif
+
+#ifdef KOKKOS_ENABLE_GNU_ATOMICS
+  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "yes");
+#else
+  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "no");
+#endif
+#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
+  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS",
+                                 "yes");
+#else
+  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS",
+                                 "no");
+#endif
+#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
+  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS",
+                                 "yes");
+#else
+  declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS",
+                                 "no");
+#endif
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_IVDEP",
+                                 "yes");
+#else
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_IVDEP",
+                                 "no");
+#endif
+#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
+  declare_configuration_metadata("vectorization",
+                                 "KOKKOS_ENABLE_PRAGMA_LOOPCOUNT", "yes");
+#else
+  declare_configuration_metadata("vectorization",
+                                 "KOKKOS_ENABLE_PRAGMA_LOOPCOUNT", "no");
+#endif
+#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_SIMD",
+                                 "yes");
+#else
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_SIMD",
+                                 "no");
+#endif
+#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_UNROLL",
+                                 "yes");
+#else
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_UNROLL",
+                                 "no");
+#endif
+#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_VECTOR",
+                                 "yes");
+#else
+  declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_VECTOR",
+                                 "no");
+#endif
+
+#ifdef KOKKOS_ENABLE_HBWSPACE
+  declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes");
+#else
+  declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no");
+#endif
+#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
+  declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC",
+                                 "yes");
+#else
+  declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC",
+                                 "no");
+#endif
+#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
+  declare_configuration_metadata("memory", "KOKKOS_ENABLE_POSIX_MEMALIGN",
+                                 "yes");
+#else
+  declare_configuration_metadata("memory", "KOKKOS_ENABLE_POSIX_MEMALIGN",
+                                 "no");
+#endif
+
+#ifdef KOKKOS_ENABLE_ASM
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "no");
+#endif
+#ifdef KOKKOS_ENABLE_CXX14
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX14", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX14", "no");
+#endif
+#ifdef KOKKOS_ENABLE_CXX17
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX17", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX17", "no");
+#endif
+#ifdef KOKKOS_ENABLE_CXX20
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX20", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX20", "no");
+#endif
+#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK",
+                                 "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK",
+                                 "no");
+#endif
+#ifdef KOKKOS_ENABLE_HWLOC
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no");
+#endif
+#ifdef KOKKOS_ENABLE_LIBRT
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no");
+#endif
+#ifdef KOKKOS_ENABLE_MPI
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "no");
+#endif
+  declare_configuration_metadata("architecture", "Default Device",
+                                 typeid(Kokkos::DefaultExecutionSpace).name());
 }
 
 void post_initialize_internal(const InitArguments& args) {
@@ -364,6 +601,24 @@ bool check_int_arg(char const* arg, char const* expected, int* value) {
   return true;
 }
 
+bool check_str_arg(char const* arg, char const* expected, std::string& value) {
+  if (!check_arg(arg, expected)) return false;
+  std::size_t arg_len = std::strlen(arg);
+  std::size_t exp_len = std::strlen(expected);
+  bool okay           = true;
+  if (arg_len == exp_len || arg[exp_len] != '=') okay = false;
+  char const* remain = arg + exp_len + 1;
+  value              = remain;
+  if (!okay) {
+    std::ostringstream ss;
+    ss << "Error: expecting an '=STRING' after command line argument '"
+       << expected << "'";
+    ss << ". Raised by Kokkos::initialize(int narg, char* argc[]).";
+    Impl::throw_runtime_exception(ss.str());
+  }
+  return true;
+}
+
 void warn_deprecated_command_line_argument(std::string deprecated,
                                            std::string valid) {
   std::cerr
@@ -390,6 +645,9 @@ void parse_command_line_arguments(int& narg, char* arg[],
   auto& skip_device      = arguments.skip_device;
   auto& disable_warnings = arguments.disable_warnings;
   auto& tune_internals   = arguments.tune_internals;
+  auto& tool_help        = arguments.tool_help;
+  auto& tool_args        = arguments.tool_args;
+  auto& tool_lib         = arguments.tool_lib;
 
   bool kokkos_threads_found  = false;
   bool kokkos_numa_found     = false;
@@ -461,7 +719,7 @@ void parse_command_line_arguments(int& narg, char* arg[],
       int num1_len    = num2 == nullptr ? strlen(num1) : num2 - num1;
       char* num1_only = new char[num1_len + 1];
       strncpy(num1_only, num1, num1_len);
-      num1_only[num1_len] = 0;
+      num1_only[num1_len] = '\0';
 
       if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) {
         throw_runtime_exception(
@@ -510,6 +768,37 @@ void parse_command_line_arguments(int& narg, char* arg[],
         arg[k] = arg[k + 1];
       }
       narg--;
+    } else if (check_str_arg(arg[iarg], "--kokkos-tools-library", tool_lib)) {
+      for (int k = iarg; k < narg - 1; k++) {
+        arg[k] = arg[k + 1];
+      }
+      narg--;
+    } else if (check_str_arg(arg[iarg], "--kokkos-tools-args", tool_args)) {
+      for (int k = iarg; k < narg - 1; k++) {
+        arg[k] = arg[k + 1];
+      }
+      narg--;
+      // strip any leading and/or trailing quotes if they were retained in the
+      // string because this will very likely cause parsing issues for tools.
+      // If the quotes are retained (via bypassing the shell):
+      //    <EXE> --kokkos-tools-args="-c my example"
+      // would be tokenized as:
+      //    "<EXE>" "\"-c" "my" "example\""
+      // instead of:
+      //    "<EXE>" "-c" "my" "example"
+      if (!tool_args.empty()) {
+        if (tool_args.front() == '"') tool_args = tool_args.substr(1);
+        if (tool_args.back() == '"')
+          tool_args = tool_args.substr(0, tool_args.length() - 1);
+      }
+      // add the name of the executable to the beginning
+      if (narg > 0) tool_args = std::string(arg[0]) + " " + tool_args;
+    } else if (check_arg(arg[iarg], "--kokkos-tools-help")) {
+      tool_help = true;
+      for (int k = iarg; k < narg - 1; k++) {
+        arg[k] = arg[k + 1];
+      }
+      narg--;
     } else if (check_arg(arg[iarg], "--kokkos-help") ||
                check_arg(arg[iarg], "--help")) {
       auto const help_message = R"(
@@ -526,7 +815,7 @@ void parse_command_line_arguments(int& narg, char* arg[],
       --kokkos-disable-warnings      : disable kokkos warning messages
       --kokkos-tune-internals        : allow Kokkos to autotune policies and declare
                                        tuning features through the tuning system. If
-				         left off, Kokkos uses heuristics
+                                       left off, Kokkos uses heuristics
       --kokkos-threads=INT           : specify total number of threads or
                                        number of threads per NUMA region if
                                        used in conjunction with '--numa' option.
@@ -540,6 +829,18 @@ void parse_command_line_arguments(int& narg, char* arg[],
                                        to be ignored. This is most useful on workstations
                                        with multiple GPUs of which one is used to drive
                                        screen output.
+      --kokkos-tools-library         : Equivalent to KOKKOS_PROFILE_LIBRARY environment
+                                       variable. Must either be full path to library or
+                                       name of library if the path is present in the
+                                       runtime library search path (e.g. LD_LIBRARY_PATH)
+      --kokkos-tools-help            : Query the (loaded) kokkos-tool for its command-line
+                                       option support (which should then be passed via
+                                       --kokkos-tools-args="...")
+      --kokkos-tools-args=STR        : A single (quoted) string of options which will be
+                                       whitespace delimited and passed to the loaded
+                                       kokkos-tool as command-line arguments. E.g.
+                                       `<EXE> --kokkos-tools-args="-c input.txt"` will
+                                       pass `<EXE> -c input.txt` as argc/argv to tool
       --------------------------------------------------------------------------------
 )";
       std::cout << help_message << std::endl;
@@ -556,6 +857,7 @@ void parse_command_line_arguments(int& narg, char* arg[],
     } else
       iarg++;
   }
+  if (tool_args.empty() && narg > 0) tool_args = arg[0];
 }
 
 void parse_environment_variables(InitArguments& arguments) {
@@ -566,6 +868,7 @@ void parse_environment_variables(InitArguments& arguments) {
   auto& skip_device      = arguments.skip_device;
   auto& disable_warnings = arguments.disable_warnings;
   auto& tune_internals   = arguments.tune_internals;
+  auto& tool_lib         = arguments.tool_lib;
   char* endptr;
   auto env_num_threads_str = std::getenv("KOKKOS_NUM_THREADS");
   if (env_num_threads_str != nullptr) {
@@ -711,7 +1014,9 @@ void parse_environment_variables(InitArguments& arguments) {
     for (char& c : env_str) {
       c = toupper(c);
     }
-    if ((env_str == "TRUE") || (env_str == "ON") || (env_str == "1"))
+    const auto _rc = std::regex_constants::icase | std::regex_constants::egrep;
+    const auto _re = std::regex("^(true|on|yes|[1-9])$", _rc);
+    if (std::regex_match(env_str, _re))
       disable_warnings = true;
     else if (disable_warnings)
       Impl::throw_runtime_exception(
@@ -733,6 +1038,16 @@ void parse_environment_variables(InitArguments& arguments) {
           "KOKKOS_TUNE_INTERNALS if both are set. Raised by "
           "Kokkos::initialize(int narg, char* argc[]).");
   }
+  auto env_tool_lib = std::getenv("KOKKOS_PROFILE_LIBRARY");
+  if (env_tool_lib != nullptr) {
+    if (!tool_lib.empty() && std::string(env_tool_lib) != tool_lib)
+      Impl::throw_runtime_exception(
+          "Error: expecting a match between --kokkos-tools-library and "
+          "KOKKOS_PROFILE_LIBRARY if both are set. Raised by "
+          "Kokkos::initialize(int narg, char* argc[]).");
+    else
+      tool_lib = env_tool_lib;
+  }
 }
 
 }  // namespace
@@ -765,6 +1080,7 @@ void pre_initialize(const InitArguments& args) {
 void post_initialize(const InitArguments& args) {
   post_initialize_internal(args);
 }
+
 }  // namespace Impl
 
 void push_finalize_hook(std::function<void()> f) { finalize_hooks.push(f); }
@@ -778,180 +1094,35 @@ void finalize_all() {
 
 void fence() { Impl::fence_internal(); }
 
+void print_helper(std::ostringstream& out,
+                  const std::map<std::string, std::string>& print_me) {
+  for (const auto& kv : print_me) {
+    out << kv.first << ": " << kv.second << '\n';
+  }
+}
+
 void print_configuration(std::ostream& out, const bool detail) {
   std::ostringstream msg;
 
-  msg << "Kokkos Version:" << std::endl;
-  msg << "  " << KOKKOS_VERSION / 10000 << "." << (KOKKOS_VERSION % 10000) / 100
-      << "." << KOKKOS_VERSION % 100 << std::endl;
+  print_helper(msg, Kokkos::Impl::metadata_map["version_info"]);
 
   msg << "Compiler:" << std::endl;
-#ifdef KOKKOS_COMPILER_APPLECC
-  msg << "  KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
-#endif
-#ifdef KOKKOS_COMPILER_CLANG
-  msg << "  KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
-#endif
-#ifdef KOKKOS_COMPILER_CRAYC
-  msg << "  KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
-#endif
-#ifdef KOKKOS_COMPILER_GNU
-  msg << "  KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
-#endif
-#ifdef KOKKOS_COMPILER_IBM
-  msg << "  KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
-#endif
-#ifdef KOKKOS_COMPILER_INTEL
-  msg << "  KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
-#endif
-#ifdef KOKKOS_COMPILER_NVCC
-  msg << "  KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
-#endif
-#ifdef KOKKOS_COMPILER_PGI
-  msg << "  KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
-#endif
+  print_helper(msg, Kokkos::Impl::metadata_map["compiler_version"]);
 
   msg << "Architecture:" << std::endl;
-#ifdef KOKKOS_ENABLE_ISA_KNC
-  msg << "  KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
-#else
-  msg << "  KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
-#endif
-#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
-  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
-#else
-  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
-#endif
-#ifdef KOKKOS_ENABLE_ISA_X86_64
-  msg << "  KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
-#else
-  msg << "  KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
-#endif
-
-  msg << "Default Device:" << typeid(Kokkos::DefaultExecutionSpace).name()
-      << std::endl;
+  print_helper(msg, Kokkos::Impl::metadata_map["architecture"]);
 
   msg << "Atomics:" << std::endl;
-  msg << "  KOKKOS_ENABLE_GNU_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_GNU_ATOMICS
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_INTEL_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
+  print_helper(msg, Kokkos::Impl::metadata_map["atomics"]);
 
   msg << "Vectorization:" << std::endl;
-  msg << "  KOKKOS_ENABLE_PRAGMA_IVDEP: ";
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
-#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_PRAGMA_SIMD: ";
-#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_PRAGMA_UNROLL: ";
-#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_PRAGMA_VECTOR: ";
-#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
+  print_helper(msg, Kokkos::Impl::metadata_map["vectorization"]);
 
   msg << "Memory:" << std::endl;
-  msg << "  KOKKOS_ENABLE_HBWSPACE: ";
-#ifdef KOKKOS_ENABLE_HBWSPACE
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
-#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_POSIX_MEMALIGN: ";
-#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
+  print_helper(msg, Kokkos::Impl::metadata_map["memory"]);
 
   msg << "Options:" << std::endl;
-  msg << "  KOKKOS_ENABLE_ASM: ";
-#ifdef KOKKOS_ENABLE_ASM
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_CXX14: ";
-#ifdef KOKKOS_ENABLE_CXX14
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_CXX17: ";
-#ifdef KOKKOS_ENABLE_CXX17
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_CXX20: ";
-#ifdef KOKKOS_ENABLE_CXX20
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
-#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_HWLOC: ";
-#ifdef KOKKOS_ENABLE_HWLOC
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_LIBRT: ";
-#ifdef KOKKOS_ENABLE_LIBRT
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-  msg << "  KOKKOS_ENABLE_MPI: ";
-#ifdef KOKKOS_ENABLE_MPI
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
+  print_helper(msg, Kokkos::Impl::metadata_map["options"]);
 
   Impl::ExecSpaceManager::get_instance().print_configuration(msg, detail);
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
index 6362487ed7eab223c8f2dff445d79c7eaada5646..dfb9f3a51cdbd9aa7e189e21f5956806d53823b5 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -132,8 +132,11 @@ void Experimental::RawMemoryAllocationFailure::print_error_message(
     case AllocationMechanism::CudaHostAlloc: o << "cudaHostAlloc()."; break;
     case AllocationMechanism::HIPMalloc: o << "hipMalloc()."; break;
     case AllocationMechanism::HIPHostMalloc: o << "hipHostMalloc()."; break;
-    case AllocationMechanism::SYCLMalloc:
-      o << "cl::sycl::malloc_device().";
+    case AllocationMechanism::SYCLMallocDevice:
+      o << "sycl::malloc_device().";
+      break;
+    case AllocationMechanism::SYCLMallocShared:
+      o << "sycl::malloc_shared().";
       break;
   }
   append_additional_error_information(o);
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
index ab966a4d4a8bc8806ea0fb1c4c0f75b9a2143d2b..5db459734631ddff5d0a29963a9ec04b9ec549ea 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -54,6 +54,9 @@
 #ifdef KOKKOS_ENABLE_HIP
 #include <HIP/Kokkos_HIP_Abort.hpp>
 #endif
+#ifdef KOKKOS_ENABLE_SYCL
+#include <SYCL/Kokkos_SYCL_Abort.hpp>
+#endif
 
 #ifndef KOKKOS_ABORT_MESSAGE_BUFFER_SIZE
 #define KOKKOS_ABORT_MESSAGE_BUFFER_SIZE 2048
@@ -93,7 +96,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
     CudaHostAlloc,
     HIPMalloc,
     HIPHostMalloc,
-    SYCLMalloc
+    SYCLMallocDevice,
+    SYCLMallocShared
   };
 
  private:
@@ -180,7 +184,10 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
 #elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)
 // HIP aborts
 #define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
-#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(__SYCL_DEVICE_ONLY__)
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+// FIXME_SYCL SYCL doesn't abort
+#define KOKKOS_IMPL_ABORT_NORETURN
+#elif !defined(KOKKOS_ENABLE_OPENMPTARGET)
 // Host aborts
 #define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
 #else
@@ -195,10 +202,12 @@ KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort(
   Kokkos::Impl::cuda_abort(message);
 #elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)
   Kokkos::Impl::hip_abort(message);
-#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(__SYCL_DEVICE_ONLY__)
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+  Kokkos::Impl::sycl_abort(message);
+#elif !defined(KOKKOS_ENABLE_OPENMPTARGET)
   Kokkos::Impl::host_abort(message);
 #else
-  (void)message;  // FIXME_OPENMPTARGET, FIXME_SYCL
+  (void)message;  // FIXME_OPENMPTARGET
 #endif
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp
index 2651229a706038fe3b8cfe8033bd4d521675003e..3068ef3db0389d48149d2d9ce28efac3112f1c27 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp
@@ -104,7 +104,7 @@ class FixedBlockSizeMemoryPool
     m_first_block = (Block*)block_record->data();
 
     auto idx_record =
-        record_type::allocate(mem_space, "FixedBlockSizeMemPool_blocks",
+        record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks",
                               num_blocks * sizeof(size_type));
     KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0);
     m_tracker.assign_allocated_record_to_uninitialized(idx_record);
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
index 3bcb60f8561d15431b2079824d6d4f4ae5d86df9..22e88ebc4fc57d4e7132bca0be2aa55f5bfc5f69 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -84,16 +84,6 @@ struct ReduceFunctorHasInit<
     typename std::enable_if<0 < sizeof(&FunctorType::init)>::type> {
   enum : bool { value = true };
 };
-// FIXME_SYCL not all compilers distinguish between the FunctorType::init and
-// the FunctorType::template init<> specialization
-#ifdef KOKKOS_ENABLE_SYCL
-template <class FunctorType>
-struct ReduceFunctorHasInit<
-    FunctorType,
-    typename std::enable_if<0 < sizeof(&FunctorType::template init<>)>::type> {
-  enum : bool { value = true };
-};
-#endif
 #endif
 
 template <class FunctorType, class Enable = void>
@@ -117,16 +107,6 @@ struct ReduceFunctorHasJoin<
     typename std::enable_if<0 < sizeof(&FunctorType::join)>::type> {
   enum : bool { value = true };
 };
-// FIXME_SYCL not all compilers distinguish between the FunctorType::join and
-// the FunctorType::template join<> specialization
-#ifdef KOKKOS_ENABLE_SYCL
-template <class FunctorType>
-struct ReduceFunctorHasJoin<
-    FunctorType,
-    typename std::enable_if<0 < sizeof(&FunctorType::template join<>)>::type> {
-  enum : bool { value = true };
-};
-#endif
 #endif
 
 template <class FunctorType, class Enable = void>
@@ -150,16 +130,6 @@ struct ReduceFunctorHasFinal<
     typename std::enable_if<0 < sizeof(&FunctorType::final)>::type> {
   enum : bool { value = true };
 };
-// FIXME_SYCL not all compilers distinguish between the FunctorType::final and
-// the FunctorType::template final<> specialization
-#ifdef KOKKOS_ENABLE_SYCL
-template <class FunctorType>
-struct ReduceFunctorHasFinal<
-    FunctorType,
-    typename std::enable_if<0 < sizeof(&FunctorType::template final<>)>::type> {
-  enum : bool { value = true };
-};
-#endif
 #endif
 
 template <class FunctorType, class Enable = void>
@@ -183,27 +153,14 @@ struct ReduceFunctorHasShmemSize<
     typename std::enable_if<0 < sizeof(&FunctorType::team_shmem_size)>::type> {
   enum : bool { value = true };
 };
-// FIXME_SYCL not all compilers distinguish between the
-// FunctorType::team_shmem_size and the FunctorType::template team_shmem_size<>
-// specialization
-#ifdef KOKKOS_ENABLE_SYCL
-template <class FunctorType>
-struct ReduceFunctorHasShmemSize<
-    FunctorType,
-    typename std::enable_if<
-        0 < sizeof(&FunctorType::template team_shmem_size<>)>::type> {
-  enum : bool { value = true };
-};
-#endif
 #endif
 
 template <class FunctorType, class ArgTag, class Enable = void>
 struct FunctorDeclaresValueType : public std::false_type {};
 
 template <class FunctorType, class ArgTag>
-struct FunctorDeclaresValueType<
-    FunctorType, ArgTag,
-    typename Impl::enable_if_type<typename FunctorType::value_type>::type>
+struct FunctorDeclaresValueType<FunctorType, ArgTag,
+                                void_t<typename FunctorType::value_type>>
     : public std::true_type {};
 
 template <class FunctorType,
@@ -290,8 +247,7 @@ struct FunctorValueTraits<FunctorType, ArgTag,
   // The reference_type for an array is 'value_type *'
   // The reference_type for a single value is 'value_type &'
 
-  using reference_type =
-      typename Impl::if_c<IsArray, value_type*, value_type&>::type;
+  using reference_type = std::conditional_t<IsArray, value_type*, value_type&>;
 
   // Number of values if single value
   template <class F>
@@ -329,8 +285,8 @@ struct FunctorValueTraits<FunctorType, ArgTag,
   struct REJECTTAG {
   };  // Reject tagged operator() when using non-tagged execution policy.
 
-  using tag_type = typename Impl::if_c<std::is_same<ArgTag, void>::value,
-                                       VOIDTAG, ArgTag>::type;
+  using tag_type =
+      std::conditional_t<std::is_same<ArgTag, void>::value, VOIDTAG, ArgTag>;
 
   //----------------------------------------
   // parallel_for operator without a tag:
@@ -1371,12 +1327,11 @@ struct FunctorValueTraits<FunctorType, ArgTag,
   enum { IS_REJECT = std::is_same<REJECTTAG, ValueType>::value };
 
  public:
-  using value_type =
-      typename Impl::if_c<IS_VOID || IS_REJECT, void, ValueType>::type;
+  using value_type = std::conditional_t<IS_VOID || IS_REJECT, void, ValueType>;
   using pointer_type =
-      typename Impl::if_c<IS_VOID || IS_REJECT, void, ValueType*>::type;
+      std::conditional_t<IS_VOID || IS_REJECT, void, ValueType*>;
   using reference_type =
-      typename Impl::if_c<IS_VOID || IS_REJECT, void, ValueType&>::type;
+      std::conditional_t<IS_VOID || IS_REJECT, void, ValueType&>;
   using functor_type = FunctorType;
 
   static_assert(
@@ -2080,43 +2035,71 @@ struct FunctorFinal {
   KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType&, void*) {}
 };
 
-/* 'final' function provided */
+/* 'final' function provided for single value but no tag*/
 template <class FunctorType, class ArgTag, class T>
-struct FunctorFinal<FunctorType, ArgTag,
-                    T&
-                    // First  substitution failure when FunctorType::final does
-                    // not exist. Second substitution failure when enable_if( &
-                    // Functor::final ) does not exist
-                    ,
-                    decltype(
-                        FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
-                            &FunctorType::final))> {
+struct FunctorFinal<
+    FunctorType, ArgTag,
+    T&
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when FunctorType::final is not compatible.
+    ,
+    typename std::enable_if<
+        std::is_same<ArgTag, void>::value,
+        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
+            &FunctorType::final))>::type> {
   KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
     f.final(*((T*)p));
   }
+};
 
-  KOKKOS_FORCEINLINE_FUNCTION static void final(FunctorType& f, void* p) {
-    f.final(*((T*)p));
+/* 'final' function provided for array value but no tag*/
+template <class FunctorType, class ArgTag, class T>
+struct FunctorFinal<
+    FunctorType, ArgTag,
+    T*
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when FunctorType::final is not compatible.
+    ,
+    typename std::enable_if<
+        std::is_same<ArgTag, void>::value,
+        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
+            &FunctorType::final))>::type> {
+  KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
+    f.final((T*)p);
   }
 };
 
-/* 'final' function provided for array value */
+/* 'final' function provided for single value and with tag */
 template <class FunctorType, class ArgTag, class T>
-struct FunctorFinal<FunctorType, ArgTag,
-                    T*
-                    // First  substitution failure when FunctorType::final does
-                    // not exist. Second substitution failure when enable_if( &
-                    // Functor::final ) does not exist
-                    ,
-                    decltype(
-                        FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
-                            &FunctorType::final))> {
+struct FunctorFinal<
+    FunctorType, ArgTag,
+    T&
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when FunctorType::final is not compatible.
+    ,
+    typename std::enable_if<
+        !std::is_same<ArgTag, void>::value,
+        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
+            &FunctorType::final))>::type> {
   KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
-    f.final((T*)p);
+    f.final(ArgTag(), *((T*)p));
   }
+};
 
-  KOKKOS_FORCEINLINE_FUNCTION static void final(FunctorType& f, void* p) {
-    f.final((T*)p);
+/* 'final' function provided for array value and with tag */
+template <class FunctorType, class ArgTag, class T>
+struct FunctorFinal<
+    FunctorType, ArgTag,
+    T*
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when FunctorType::final is not compatible.
+    ,
+    typename std::enable_if<
+        !std::is_same<ArgTag, void>::value,
+        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
+            &FunctorType::final))>::type> {
+  KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
+    f.final(ArgTag(), (T*)p);
   }
 };
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index aeebcb64ee42b14f40465f611ec1982917904084..5c0eaa0a1ef80fa02e2f745f1d7e53d6fc45b8d3 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -250,10 +250,10 @@ SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::
       static_cast<SharedAllocationRecord<void, void> *>(this);
 
   strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
+          SharedAllocationHeader::maximum_label_length - 1);
   // Set last element zero, in case c_str is too long
   RecordBase::m_alloc_ptr
-      ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
+      ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0';
 }
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
index 55d70985dcd1f921d4082b507cf84d1044ad8fbb..79ee7e80db3115f1c9c14366e2c237c042ab0bdb 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
@@ -49,10 +49,8 @@
 
 #include <impl/Kokkos_HostBarrier.hpp>
 
-#if !defined(_WIN32)
-#include <sched.h>
-#include <time.h>
-#else
+#include <thread>
+#if defined(_WIN32)
 #include <process.h>
 #include <winsock2.h>
 #include <windows.h>
@@ -63,18 +61,15 @@ namespace Impl {
 
 void HostBarrier::impl_backoff_wait_until_equal(
     int* ptr, const int v, const bool active_wait) noexcept {
-#if !defined(_WIN32)
-  timespec req;
-  req.tv_sec     = 0;
   unsigned count = 0u;
 
   while (!test_equal(ptr, v)) {
     const int c = ::Kokkos::log2(++count);
     if (!active_wait || c > log2_iterations_till_sleep) {
-      req.tv_nsec = c < 16 ? 256 * c : 4096;
-      nanosleep(&req, nullptr);
+      std::this_thread::sleep_for(
+          std::chrono::nanoseconds(c < 16 ? 256 * c : 4096));
     } else if (c > log2_iterations_till_yield) {
-      sched_yield();
+      std::this_thread::yield();
     }
 #if defined(KOKKOS_ENABLE_ASM)
 #if defined(__PPC64__)
@@ -91,18 +86,6 @@ void HostBarrier::impl_backoff_wait_until_equal(
 #endif
 #endif
   }
-#else  // _WIN32
-  while (!test_equal(ptr, v)) {
-#if defined(KOKKOS_ENABLE_ASM)
-    for (int j = 0; j < num_nops; ++j) {
-      __asm__ __volatile__("nop\n");
-    }
-    __asm__ __volatile__("pause\n" ::: "memory");
-#endif
-  }
-#endif
-  // printf("W: %d\n", count);
 }
-
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..97286dd07f4ea2ee94f3070768f425e2ef5b7896
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
@@ -0,0 +1,178 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_HOST_SHARED_PTR_HPP
+#define KOKKOS_IMPL_HOST_SHARED_PTR_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+
+#include <functional>
+
+namespace Kokkos {
+namespace Impl {
+
+template <typename T>
+class HostSharedPtr {
+ public:
+  using element_type = T;
+
+  KOKKOS_DEFAULTED_FUNCTION constexpr HostSharedPtr() = default;
+  KOKKOS_FUNCTION constexpr HostSharedPtr(std::nullptr_t) {}
+
+  explicit HostSharedPtr(T* element_ptr)
+      : HostSharedPtr(element_ptr, [](T* const t) { delete t; }) {}
+
+  template <class Deleter>
+  HostSharedPtr(T* element_ptr, const Deleter& deleter)
+      : m_element_ptr(element_ptr) {
+#ifdef KOKKOS_ENABLE_CXX17
+    static_assert(std::is_invocable_v<Deleter, T*> &&
+                  std::is_copy_constructible_v<Deleter>);
+#endif
+    if (element_ptr) {
+      try {
+        m_control = new Control{deleter, 1};
+      } catch (...) {
+        deleter(element_ptr);
+        throw;
+      }
+    }
+  }
+
+  KOKKOS_FUNCTION HostSharedPtr(HostSharedPtr&& other) noexcept
+      : m_element_ptr(other.m_element_ptr), m_control(other.m_control) {
+    other.m_element_ptr = nullptr;
+    other.m_control     = nullptr;
+  }
+
+  KOKKOS_FUNCTION HostSharedPtr(const HostSharedPtr& other) noexcept
+      : m_element_ptr(other.m_element_ptr), m_control(other.m_control) {
+    // FIXME_OPENMPTARGET requires something like KOKKOS_IMPL_IF_ON_HOST
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+    if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#endif
+  }
+
+  KOKKOS_FUNCTION HostSharedPtr& operator=(HostSharedPtr&& other) noexcept {
+    if (&other != this) {
+      cleanup();
+      m_element_ptr       = other.m_element_ptr;
+      other.m_element_ptr = nullptr;
+      m_control           = other.m_control;
+      other.m_control     = nullptr;
+    }
+    return *this;
+  }
+
+  KOKKOS_FUNCTION HostSharedPtr& operator=(
+      const HostSharedPtr& other) noexcept {
+    if (&other != this) {
+      cleanup();
+      m_element_ptr = other.m_element_ptr;
+      m_control     = other.m_control;
+      // FIXME_OPENMPTARGET
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#endif
+    }
+    return *this;
+  }
+
+  KOKKOS_FUNCTION ~HostSharedPtr() { cleanup(); }
+
+  // returns the stored pointer
+  KOKKOS_FUNCTION T* get() const noexcept { return m_element_ptr; }
+  // dereferences the stored pointer
+  KOKKOS_FUNCTION T& operator*() const noexcept {
+    KOKKOS_EXPECTS(bool(*this));
+    return *get();
+  }
+  // dereferences the stored pointer
+  KOKKOS_FUNCTION T* operator->() const noexcept {
+    KOKKOS_EXPECTS(bool(*this));
+    return get();
+  }
+
+  // checks if the stored pointer is not null
+  KOKKOS_FUNCTION explicit operator bool() const noexcept {
+    return get() != nullptr;
+  }
+
+  // returns the number of HostSharedPtr instances managing the curent object or
+  // 0 if there is no managed object.
+  int use_count() const noexcept {
+    return m_control ? m_control->m_counter : 0;
+  }
+
+ private:
+  KOKKOS_FUNCTION void cleanup() noexcept {
+    // FIXME_OPENMPTARGET
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+    // If m_counter is set, then this instance is responsible for managing the
+    // object pointed to by m_counter and m_element_ptr.
+    if (m_control) {
+      int const count = Kokkos::atomic_fetch_sub(&(m_control->m_counter), 1);
+      if (count == 1) {
+        (m_control->m_deleter)(m_element_ptr);
+        m_element_ptr = nullptr;
+        delete m_control;
+        m_control = nullptr;
+      }
+    }
+#endif
+  }
+
+  struct Control {
+    std::function<void(T*)> m_deleter;
+    int m_counter;
+  };
+
+  T* m_element_ptr   = nullptr;
+  Control* m_control = nullptr;
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
index 130e5cce13a3cd04e1bda20339d56f9b57764c3e..ed46d170e53ebb58e118c8d020073ed12d3c1064 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -42,9 +42,8 @@
 //@HEADER
 */
 
-#include <cstdio>
-#include <algorithm>
 #include <Kokkos_Macros.hpp>
+
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>
 #include <impl/Kokkos_Tools.hpp>
@@ -352,18 +351,12 @@ SharedAllocationRecord<void, void>
     SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record;
 #endif
 
-void SharedAllocationRecord<Kokkos::HostSpace, void>::deallocate(
-    SharedAllocationRecord<void, void> *arg_rec) {
-  delete static_cast<SharedAllocationRecord *>(arg_rec);
-}
-
 SharedAllocationRecord<Kokkos::HostSpace, void>::~SharedAllocationRecord()
 #if defined( \
     KOKKOS_IMPL_INTEL_WORKAROUND_NOEXCEPT_SPECIFICATION_VIRTUAL_FUNCTION)
     noexcept
 #endif
 {
-
   m_space.deallocate(RecordBase::m_alloc_ptr->m_label,
                      SharedAllocationRecord<void, void>::m_alloc_ptr,
                      SharedAllocationRecord<void, void>::m_alloc_size,
@@ -399,7 +392,7 @@ SharedAllocationRecord<Kokkos::HostSpace, void>::SharedAllocationRecord(
     const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
-    : SharedAllocationRecord<void, void>(
+    : base_t(
 #ifdef KOKKOS_ENABLE_DEBUG
           &SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record,
 #endif
@@ -407,91 +400,10 @@ SharedAllocationRecord<Kokkos::HostSpace, void>::SharedAllocationRecord(
                                                arg_alloc_size),
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
       m_space(arg_space) {
-  // Fill in the Header information
-  RecordBase::m_alloc_ptr->m_record =
-      static_cast<SharedAllocationRecord<void, void> *>(this);
-
-  strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(),
-          SharedAllocationHeader::maximum_label_length);
-  // Set last element zero, in case c_str is too long
-  RecordBase::m_alloc_ptr
-      ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0;
-}
-
-//----------------------------------------------------------------------------
-
-void *SharedAllocationRecord<Kokkos::HostSpace, void>::allocate_tracked(
-    const Kokkos::HostSpace &arg_space, const std::string &arg_alloc_label,
-    const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return nullptr;
-
-  SharedAllocationRecord *const r =
-      allocate(arg_space, arg_alloc_label, arg_alloc_size);
-
-  RecordBase::increment(r);
-
-  return r->data();
-}
-
-void SharedAllocationRecord<Kokkos::HostSpace, void>::deallocate_tracked(
-    void *const arg_alloc_ptr) {
-  if (arg_alloc_ptr != nullptr) {
-    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);
-
-    RecordBase::decrement(r);
-  }
-}
-
-void *SharedAllocationRecord<Kokkos::HostSpace, void>::reallocate_tracked(
-    void *const arg_alloc_ptr, const size_t arg_alloc_size) {
-  SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr);
-  SharedAllocationRecord *const r_new =
-      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
-
-  Kokkos::Impl::DeepCopy<HostSpace, HostSpace>(
-      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
-
-  RecordBase::increment(r_new);
-  RecordBase::decrement(r_old);
-
-  return r_new->data();
+  this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr,
+                                                  arg_label);
 }
 
-SharedAllocationRecord<Kokkos::HostSpace, void> *
-SharedAllocationRecord<Kokkos::HostSpace, void>::get_record(void *alloc_ptr) {
-  using Header     = SharedAllocationHeader;
-  using RecordHost = SharedAllocationRecord<Kokkos::HostSpace, void>;
-
-  SharedAllocationHeader const *const head =
-      alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;
-  RecordHost *const record =
-      head ? static_cast<RecordHost *>(head->m_record) : nullptr;
-
-  if (!alloc_ptr || record->m_alloc_ptr != head) {
-    Kokkos::Impl::throw_runtime_exception(
-        std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , "
-                    "void >::get_record ERROR"));
-  }
-
-  return record;
-}
-
-// Iterate records to print orphaned memory ...
-#ifdef KOKKOS_ENABLE_DEBUG
-void SharedAllocationRecord<Kokkos::HostSpace, void>::print_records(
-    std::ostream &s, const Kokkos::HostSpace &, bool detail) {
-  SharedAllocationRecord<void, void>::print_host_accessible_records(
-      s, "HostSpace", &s_root_record, detail);
-}
-#else
-void SharedAllocationRecord<Kokkos::HostSpace, void>::print_records(
-    std::ostream &, const Kokkos::HostSpace &, bool) {
-  throw_runtime_exception(
-      "SharedAllocationRecord<HostSpace>::print_records only works with "
-      "KOKKOS_ENABLE_DEBUG enabled");
-}
-#endif
-
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -568,3 +480,22 @@ void unlock_address_host_space(void *ptr) {
 
 }  // namespace Impl
 }  // namespace Kokkos
+
+//==============================================================================
+// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
+
+#include <impl/Kokkos_SharedAlloc_timpl.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+// To avoid additional compilation cost for something that's (mostly?) not
+// performance sensitive, we explicity instantiate these CRTP base classes here,
+// where we have access to the associated *_timpl.hpp header files.
+template class SharedAllocationRecordCommon<Kokkos::HostSpace>;
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
+//==============================================================================
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
index 2480967ebd18b477c5193857e73040f707b7b435..d4cae7f122ed182cf88522d5d60729a0906cce5b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -813,14 +813,16 @@ ThreadVectorRange(
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(member, count);
 }
 
-template <typename iType, typename Member>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<iType, Member>
+template <typename iType1, typename iType2, typename Member>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Member>
 ThreadVectorRange(
-    Member const& member, iType arg_begin, iType arg_end,
+    Member const& member, iType1 arg_begin, iType2 arg_end,
     typename std::enable_if<
         Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  using iType = typename std::common_type<iType1, iType2>::type;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(
-      member, arg_begin, arg_end);
+      member, iType(arg_begin), iType(arg_end));
 }
 
 //----------------------------------------------------------------------------
@@ -1010,6 +1012,25 @@ parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
   }
 }
 
+template <typename iType, class Lambda, typename ReducerType, typename Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Kokkos::is_reducer<ReducerType>::value &&
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
+                  loop_boundaries,
+              const Lambda& lambda, const ReducerType& reducer) {
+  typename ReducerType::value_type scan_val;
+  reducer.init(scan_val);
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, true);
+  }
+}
+
 //----------------------------------------------------------------------------
 
 template <class Member>
diff --git a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index 262aa9e3ea1d6e3f32993f0ebc7b101f982633b7..76d553601923fd7282132fbff05ce69a4e576e97 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -57,6 +57,9 @@ void memory_fence() {
 #pragma omp flush
 #elif defined(__HIP_DEVICE_COMPILE__)
   __threadfence();
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+  sycl::ONEAPI::atomic_fence(sycl::ONEAPI::memory_order::acq_rel,
+                             sycl::ONEAPI::memory_scope::device);
 #elif defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
   asm volatile("mfence" ::: "memory");
 #elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || \
diff --git a/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp b/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e53afe436daff997726be8cb0c880887c32de1a4
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp
@@ -0,0 +1,73 @@
+#include <Kokkos_NumericTraits.hpp>
+
+// NOTE These out-of class definitions are only required with C++14.  Since
+// C++17, a static data member declared constrexpr is impllictly inline.
+
+#if !defined(KOKKOS_ENABLE_CXX17)
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+#define OUT_OF_CLASS_DEFINTION_FLOATING_POINT(TRAIT) \
+  constexpr float TRAIT##_helper<float>::value;      \
+  constexpr double TRAIT##_helper<double>::value;    \
+  constexpr long double TRAIT##_helper<long double>::value
+
+#define OUT_OF_CLASS_DEFINTION_INTEGRAL(TRAIT)                          \
+  constexpr bool TRAIT##_helper<bool>::value;                           \
+  constexpr char TRAIT##_helper<char>::value;                           \
+  constexpr signed char TRAIT##_helper<signed char>::value;             \
+  constexpr unsigned char TRAIT##_helper<unsigned char>::value;         \
+  constexpr short TRAIT##_helper<short>::value;                         \
+  constexpr unsigned short TRAIT##_helper<unsigned short>::value;       \
+  constexpr int TRAIT##_helper<int>::value;                             \
+  constexpr unsigned int TRAIT##_helper<unsigned int>::value;           \
+  constexpr long int TRAIT##_helper<long int>::value;                   \
+  constexpr unsigned long int TRAIT##_helper<unsigned long int>::value; \
+  constexpr long long int TRAIT##_helper<long long int>::value;         \
+  constexpr unsigned long long int TRAIT##_helper<unsigned long long int>::value
+
+#define OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(TRAIT) \
+  constexpr int TRAIT##_helper<float>::value;          \
+  constexpr int TRAIT##_helper<double>::value;         \
+  constexpr int TRAIT##_helper<long double>::value
+
+#define OUT_OF_CLASS_DEFINTION_INTEGRAL_2(TRAIT)          \
+  constexpr int TRAIT##_helper<bool>::value;              \
+  constexpr int TRAIT##_helper<char>::value;              \
+  constexpr int TRAIT##_helper<signed char>::value;       \
+  constexpr int TRAIT##_helper<unsigned char>::value;     \
+  constexpr int TRAIT##_helper<short>::value;             \
+  constexpr int TRAIT##_helper<unsigned short>::value;    \
+  constexpr int TRAIT##_helper<int>::value;               \
+  constexpr int TRAIT##_helper<unsigned int>::value;      \
+  constexpr int TRAIT##_helper<long int>::value;          \
+  constexpr int TRAIT##_helper<unsigned long int>::value; \
+  constexpr int TRAIT##_helper<long long int>::value;     \
+  constexpr int TRAIT##_helper<unsigned long long int>::value
+
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT(infinity);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT(epsilon);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT(round_error);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT(norm_min);
+
+OUT_OF_CLASS_DEFINTION_INTEGRAL(finite_min);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT(finite_min);
+OUT_OF_CLASS_DEFINTION_INTEGRAL(finite_max);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT(finite_max);
+
+OUT_OF_CLASS_DEFINTION_INTEGRAL_2(digits);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(digits);
+OUT_OF_CLASS_DEFINTION_INTEGRAL_2(digits10);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(digits10);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(max_digits10);
+OUT_OF_CLASS_DEFINTION_INTEGRAL_2(radix);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(radix);
+
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(min_exponent);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(min_exponent10);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(max_exponent);
+OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(max_exponent10);
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
index 4b7e02bbb2a0d96e5e1a852feccfefd2bf603274..94ea6e1a2b10c33a81e4f2c6b7a932577ce6144b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
@@ -49,19 +49,30 @@
 #include <dlfcn.h>
 #endif
 
+#include <algorithm>
+#include <array>
 #include <cstring>
+#include <iostream>
+#include <stack>
 #include <unordered_map>
 #include <unordered_set>
-#include <algorithm>
 #include <vector>
-#include <array>
-#include <stack>
-#include <iostream>
 namespace Kokkos {
 
 namespace Tools {
 
 namespace Experimental {
+
+namespace Impl {
+void tool_invoked_fence(const uint32_t /* devID */) {
+  /**
+   * Currently the function ignores the device ID,
+   * Eventually we want to support fencing only
+   * a given stream/resource
+   */
+  Kokkos::fence();
+}
+}  // namespace Impl
 #ifdef KOKKOS_ENABLE_TUNING
 static size_t kernel_name_context_variable_id;
 static size_t kernel_type_context_variable_id;
@@ -74,9 +85,10 @@ static std::unordered_map<size_t, VariableInfo> variable_metadata;
 static EventSet current_callbacks;
 static EventSet backup_callbacks;
 static EventSet no_profiling;
-
+static Kokkos::Tools::Experimental::ToolSettings tool_requirements;
 bool eventSetsEqual(const EventSet& l, const EventSet& r) {
   return l.init == r.init && l.finalize == r.finalize &&
+         l.parse_args == r.parse_args && l.print_help == r.print_help &&
          l.begin_parallel_for == r.begin_parallel_for &&
          l.end_parallel_for == r.end_parallel_for &&
          l.begin_parallel_reduce == r.begin_parallel_reduce &&
@@ -95,6 +107,10 @@ bool eventSetsEqual(const EventSet& l, const EventSet& r) {
          l.end_deep_copy == r.end_deep_copy && l.begin_fence == r.begin_fence &&
          l.end_fence == r.end_fence && l.sync_dual_view == r.sync_dual_view &&
          l.modify_dual_view == r.modify_dual_view &&
+         l.declare_metadata == r.declare_metadata &&
+         l.request_tool_settings == r.request_tool_settings &&
+         l.provide_tool_programming_interface ==
+             r.provide_tool_programming_interface &&
          l.declare_input_type == r.declare_input_type &&
          l.declare_output_type == r.declare_output_type &&
          l.end_tuning_context == r.end_tuning_context &&
@@ -102,6 +118,24 @@ bool eventSetsEqual(const EventSet& l, const EventSet& r) {
          l.request_output_values == r.request_output_values &&
          l.declare_optimization_goal == r.declare_optimization_goal;
 }
+enum class MayRequireGlobalFencing : bool { No, Yes };
+template <typename Callback, typename... Args>
+inline void invoke_kokkosp_callback(
+    MayRequireGlobalFencing may_require_global_fencing,
+    const Callback& callback, Args&&... args) {
+  if (callback != nullptr) {
+    // two clause if statement
+    // may_require_global_fencing: "if this callback ever needs a fence", AND
+    // if the tool requires global fencing (default true, but tools can
+    // overwrite)
+    if (may_require_global_fencing == MayRequireGlobalFencing::Yes &&
+        (Kokkos::Tools::Experimental::tool_requirements
+             .requires_global_fencing)) {
+      Kokkos::fence();
+    }
+    (*callback)(std::forward<Args>(args)...);
+  }
+}
 }  // namespace Experimental
 bool profileLibraryLoaded() {
   return !Experimental::eventSetsEqual(Experimental::current_callbacks,
@@ -110,11 +144,10 @@ bool profileLibraryLoaded() {
 
 void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID,
                       uint64_t* kernelID) {
-  if (Experimental::current_callbacks.begin_parallel_for != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.begin_parallel_for)(kernelPrefix.c_str(),
-                                                          devID, kernelID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.begin_parallel_for, kernelPrefix.c_str(),
+      devID, kernelID);
 #ifdef KOKKOS_ENABLE_TUNING
   if (Kokkos::tune_internals()) {
     auto context_id = Experimental::get_new_context_id();
@@ -130,10 +163,9 @@ void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID,
 }
 
 void endParallelFor(const uint64_t kernelID) {
-  if (Experimental::current_callbacks.end_parallel_for != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.end_parallel_for)(kernelID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.end_parallel_for, kernelID);
 #ifdef KOKKOS_ENABLE_TUNING
   if (Kokkos::tune_internals()) {
     Experimental::end_context(Experimental::get_current_context_id());
@@ -143,11 +175,10 @@ void endParallelFor(const uint64_t kernelID) {
 
 void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID,
                        uint64_t* kernelID) {
-  if (Experimental::current_callbacks.begin_parallel_scan != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.begin_parallel_scan)(kernelPrefix.c_str(),
-                                                           devID, kernelID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.begin_parallel_scan, kernelPrefix.c_str(),
+      devID, kernelID);
 #ifdef KOKKOS_ENABLE_TUNING
   if (Kokkos::tune_internals()) {
     auto context_id = Experimental::get_new_context_id();
@@ -163,10 +194,9 @@ void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID,
 }
 
 void endParallelScan(const uint64_t kernelID) {
-  if (Experimental::current_callbacks.end_parallel_scan != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.end_parallel_scan)(kernelID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.end_parallel_scan, kernelID);
 #ifdef KOKKOS_ENABLE_TUNING
   if (Kokkos::tune_internals()) {
     Experimental::end_context(Experimental::get_current_context_id());
@@ -176,11 +206,10 @@ void endParallelScan(const uint64_t kernelID) {
 
 void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID,
                          uint64_t* kernelID) {
-  if (Experimental::current_callbacks.begin_parallel_reduce != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.begin_parallel_reduce)(
-        kernelPrefix.c_str(), devID, kernelID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.begin_parallel_reduce,
+      kernelPrefix.c_str(), devID, kernelID);
 #ifdef KOKKOS_ENABLE_TUNING
   if (Kokkos::tune_internals()) {
     auto context_id = Experimental::get_new_context_id();
@@ -196,10 +225,9 @@ void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID,
 }
 
 void endParallelReduce(const uint64_t kernelID) {
-  if (Experimental::current_callbacks.end_parallel_reduce != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.end_parallel_reduce)(kernelID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.end_parallel_reduce, kernelID);
 #ifdef KOKKOS_ENABLE_TUNING
   if (Kokkos::tune_internals()) {
     Experimental::end_context(Experimental::get_current_context_id());
@@ -208,44 +236,43 @@ void endParallelReduce(const uint64_t kernelID) {
 }
 
 void pushRegion(const std::string& kName) {
-  if (Experimental::current_callbacks.push_region != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.push_region)(kName.c_str());
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.push_region, kName.c_str());
 }
 
 void popRegion() {
-  if (Experimental::current_callbacks.pop_region != nullptr) {
-    Kokkos::fence();
-    (*Experimental::current_callbacks.pop_region)();
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::Yes,
+      Experimental::current_callbacks.pop_region);
 }
 
 void allocateData(const SpaceHandle space, const std::string label,
                   const void* ptr, const uint64_t size) {
-  if (Experimental::current_callbacks.allocate_data != nullptr) {
-    (*Experimental::current_callbacks.allocate_data)(space, label.c_str(), ptr,
-                                                     size);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.allocate_data, space, label.c_str(), ptr,
+      size);
 }
 
 void deallocateData(const SpaceHandle space, const std::string label,
                     const void* ptr, const uint64_t size) {
-  if (Experimental::current_callbacks.deallocate_data != nullptr) {
-    (*Experimental::current_callbacks.deallocate_data)(space, label.c_str(),
-                                                       ptr, size);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.deallocate_data, space, label.c_str(),
+      ptr, size);
 }
 
 void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
                    const void* dst_ptr, const SpaceHandle src_space,
                    const std::string src_label, const void* src_ptr,
                    const uint64_t size) {
-  if (Experimental::current_callbacks.begin_deep_copy != nullptr) {
-    (*Experimental::current_callbacks.begin_deep_copy)(
-        dst_space, dst_label.c_str(), dst_ptr, src_space, src_label.c_str(),
-        src_ptr, size);
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.begin_deep_copy, dst_space,
+      dst_label.c_str(), dst_ptr, src_space, src_label.c_str(), src_ptr, size);
 #ifdef KOKKOS_ENABLE_TUNING
+  if (Experimental::current_callbacks.begin_deep_copy != nullptr) {
     if (Kokkos::tune_internals()) {
       auto context_id = Experimental::get_new_context_id();
       Experimental::begin_context(context_id);
@@ -257,64 +284,128 @@ void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
               Experimental::kernel_type_context_variable_id, "deep_copy")};
       Experimental::set_input_values(context_id, 2, contextValues);
     }
-#endif
   }
+#endif
 }
 
 void endDeepCopy() {
-  if (Experimental::current_callbacks.end_deep_copy != nullptr) {
-    (*Experimental::current_callbacks.end_deep_copy)();
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.end_deep_copy);
 #ifdef KOKKOS_ENABLE_TUNING
+  if (Experimental::current_callbacks.end_deep_copy != nullptr) {
     if (Kokkos::tune_internals()) {
       Experimental::end_context(Experimental::get_current_context_id());
     }
-#endif
   }
+#endif
 }
 
 void beginFence(const std::string name, const uint32_t deviceId,
                 uint64_t* handle) {
-  if (Experimental::current_callbacks.begin_fence != nullptr) {
-    (*Experimental::current_callbacks.begin_fence)(name.c_str(), deviceId,
-                                                   handle);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.begin_fence, name.c_str(), deviceId,
+      handle);
 }
 
 void endFence(const uint64_t handle) {
-  if (Experimental::current_callbacks.end_fence != nullptr) {
-    (*Experimental::current_callbacks.end_fence)(handle);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.end_fence, handle);
 }
 
 void createProfileSection(const std::string& sectionName, uint32_t* secID) {
-  if (Experimental::current_callbacks.create_profile_section != nullptr) {
-    (*Experimental::current_callbacks.create_profile_section)(
-        sectionName.c_str(), secID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.create_profile_section,
+      sectionName.c_str(), secID);
 }
 
 void startSection(const uint32_t secID) {
-  if (Experimental::current_callbacks.start_profile_section != nullptr) {
-    (*Experimental::current_callbacks.start_profile_section)(secID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.start_profile_section, secID);
 }
 
 void stopSection(const uint32_t secID) {
-  if (Experimental::current_callbacks.stop_profile_section != nullptr) {
-    (*Experimental::current_callbacks.stop_profile_section)(secID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.stop_profile_section, secID);
 }
 
 void destroyProfileSection(const uint32_t secID) {
-  if (Experimental::current_callbacks.destroy_profile_section != nullptr) {
-    (*Experimental::current_callbacks.destroy_profile_section)(secID);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.destroy_profile_section, secID);
 }
 
 void markEvent(const std::string& eventName) {
-  if (Experimental::current_callbacks.profile_event != nullptr) {
-    (*Experimental::current_callbacks.profile_event)(eventName.c_str());
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.profile_event, eventName.c_str());
+}
+
+bool printHelp(const std::string& args) {
+  if (Experimental::current_callbacks.print_help == nullptr) {
+    return false;
   }
+  std::string arg0  = args.substr(0, args.find_first_of(' '));
+  const char* carg0 = arg0.c_str();
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.print_help, const_cast<char*>(carg0));
+  return true;
+}
+
+void parseArgs(int _argc, char** _argv) {
+  if (Experimental::current_callbacks.parse_args != nullptr && _argc > 0) {
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.parse_args, _argc, _argv);
+  }
+}
+
+void parseArgs(const std::string& args) {
+  if (Experimental::current_callbacks.parse_args == nullptr) {
+    return;
+  }
+  using strvec_t = std::vector<std::string>;
+  auto tokenize  = [](const std::string& line, const std::string& delimiters) {
+    strvec_t _result{};
+    std::size_t _bidx = 0;  // position that is the beginning of the new string
+    std::size_t _didx = 0;  // position of the delimiter in the string
+    while (_bidx < line.length() && _didx < line.length()) {
+      // find the first character (starting at _didx) that is not a delimiter
+      _bidx = line.find_first_not_of(delimiters, _didx);
+      // if no more non-delimiter chars, done
+      if (_bidx == std::string::npos) break;
+      // starting at the position of the new string, find the next delimiter
+      _didx = line.find_first_of(delimiters, _bidx);
+      // starting at the position of the new string, get the characters
+      // between this position and the next delimiter
+      std::string _tmp = line.substr(_bidx, _didx - _bidx);
+      // don't add empty strings
+      if (!_tmp.empty()) _result.emplace_back(_tmp);
+    }
+    return _result;
+  };
+  auto vargs = tokenize(args, " \t");
+  if (vargs.size() == 0) return;
+  auto _argc          = static_cast<int>(vargs.size());
+  char** _argv        = new char*[_argc + 1];
+  _argv[vargs.size()] = nullptr;
+  for (int i = 0; i < _argc; ++i) {
+    auto& _str = vargs.at(i);
+    _argv[i]   = new char[_str.length() + 1];
+    std::memcpy(_argv[i], _str.c_str(), _str.length() * sizeof(char));
+    _argv[i][_str.length()] = '\0';
+  }
+  parseArgs(_argc, _argv);
+  for (int i = 0; i < _argc; ++i) {
+    delete[] _argv[i];
+  }
+  delete[] _argv;
 }
 
 SpaceHandle make_space_handle(const char* space_name) {
@@ -323,7 +414,19 @@ SpaceHandle make_space_handle(const char* space_name) {
   return handle;
 }
 
-void initialize() {
+template <typename Callback>
+void lookup_function(void* dlopen_handle, const std::string& basename,
+                     Callback& callback) {
+#ifdef KOKKOS_ENABLE_LIBDL
+  // dlsym returns a pointer to an object, while we want to assign to
+  // pointer to function A direct cast will give warnings hence, we have to
+  // workaround the issue by casting pointer to pointers.
+  void* p  = dlsym(dlopen_handle, basename.c_str());
+  callback = *reinterpret_cast<Callback*>(&p);
+#endif
+}
+
+void initialize(const std::string& profileLibrary) {
   // Make sure initialize calls happens only once
   static int is_initialized = 0;
   if (is_initialized) return;
@@ -332,13 +435,9 @@ void initialize() {
 #ifdef KOKKOS_ENABLE_LIBDL
   void* firstProfileLibrary = nullptr;
 
-  char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
+  if (profileLibrary.empty()) return;
 
-  // If we do not find a profiling library in the environment then exit
-  // early.
-  if (envProfileLibrary == nullptr) {
-    return;
-  }
+  char* envProfileLibrary = const_cast<char*>(profileLibrary.c_str());
 
   char* envProfileCopy =
       (char*)malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
@@ -361,118 +460,139 @@ void initialize() {
       std::cout << "KokkosP: Library Loaded: " << profileLibraryName
                 << std::endl;
 #endif
-      // dlsym returns a pointer to an object, while we want to assign to
-      // pointer to function A direct cast will give warnings hence, we have to
-      // workaround the issue by casting pointer to pointers.
-      auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
-      Experimental::set_begin_parallel_for_callback(
-          *reinterpret_cast<beginFunction*>(&p1));
-      auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
-      Experimental::set_begin_parallel_scan_callback(
-          *reinterpret_cast<beginFunction*>(&p2));
-      auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
-      Experimental::set_begin_parallel_reduce_callback(
-          *reinterpret_cast<beginFunction*>(&p3));
-
-      auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
-      Experimental::set_end_parallel_scan_callback(
-          *reinterpret_cast<endFunction*>(&p4));
-      auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
-      Experimental::set_end_parallel_for_callback(
-          *reinterpret_cast<endFunction*>(&p5));
-      auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
-      Experimental::set_end_parallel_reduce_callback(
-          *reinterpret_cast<endFunction*>(&p6));
-
-      auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
-      Experimental::set_init_callback(*reinterpret_cast<initFunction*>(&p7));
-      auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
-      Experimental::set_finalize_callback(
-          *reinterpret_cast<finalizeFunction*>(&p8));
-
-      auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
-      Experimental::set_push_region_callback(
-          *reinterpret_cast<pushFunction*>(&p9));
-      auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
-      Experimental::set_pop_region_callback(
-          *reinterpret_cast<popFunction*>(&p10));
-
-      auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
-      Experimental::set_allocate_data_callback(
-          *reinterpret_cast<allocateDataFunction*>(&p11));
-      auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
-      Experimental::set_deallocate_data_callback(
-          *reinterpret_cast<deallocateDataFunction*>(&p12));
-
-      auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy");
-      Experimental::set_begin_deep_copy_callback(
-          *reinterpret_cast<beginDeepCopyFunction*>(&p13));
-      auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy");
-      Experimental::set_end_deep_copy_callback(
-          *reinterpret_cast<endDeepCopyFunction*>(&p14));
-
-      auto p15 = dlsym(firstProfileLibrary, "kokkosp_begin_fence");
-      Experimental::set_begin_fence_callback(
-          *reinterpret_cast<beginFenceFunction*>(&p15));
-      auto p16 = dlsym(firstProfileLibrary, "kokkosp_end_fence");
-      Experimental::set_end_fence_callback(
-          *reinterpret_cast<endFenceFunction*>(&p16));
-
-      auto p17 = dlsym(firstProfileLibrary, "kokkosp_dual_view_sync");
-      Experimental::set_dual_view_sync_callback(
-          *reinterpret_cast<dualViewSyncFunction*>(&p17));
-      auto p18 = dlsym(firstProfileLibrary, "kokkosp_dual_view_modify");
-      Experimental::set_dual_view_modify_callback(
-          *reinterpret_cast<dualViewModifyFunction*>(&p18));
-
-      auto p19 = dlsym(firstProfileLibrary, "kokkosp_create_profile_section");
-      Experimental::set_create_profile_section_callback(
-          *(reinterpret_cast<createProfileSectionFunction*>(&p19)));
-      auto p20 = dlsym(firstProfileLibrary, "kokkosp_start_profile_section");
-      Experimental::set_start_profile_section_callback(
-          *reinterpret_cast<startProfileSectionFunction*>(&p20));
-      auto p21 = dlsym(firstProfileLibrary, "kokkosp_stop_profile_section");
-      Experimental::set_stop_profile_section_callback(
-          *reinterpret_cast<stopProfileSectionFunction*>(&p21));
-      auto p22 = dlsym(firstProfileLibrary, "kokkosp_destroy_profile_section");
-      Experimental::set_destroy_profile_section_callback(
-          *(reinterpret_cast<destroyProfileSectionFunction*>(&p22)));
-
-      auto p23 = dlsym(firstProfileLibrary, "kokkosp_profile_event");
-      Experimental::set_profile_event_callback(
-          *reinterpret_cast<profileEventFunction*>(&p23));
-
+      lookup_function(
+          firstProfileLibrary, "kokkosp_begin_parallel_scan",
+          Kokkos::Tools::Experimental::current_callbacks.begin_parallel_scan);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_begin_parallel_for",
+          Kokkos::Tools::Experimental::current_callbacks.begin_parallel_for);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_begin_parallel_reduce",
+          Kokkos::Tools::Experimental::current_callbacks.begin_parallel_reduce);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_end_parallel_scan",
+          Kokkos::Tools::Experimental::current_callbacks.end_parallel_scan);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_end_parallel_for",
+          Kokkos::Tools::Experimental::current_callbacks.end_parallel_for);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_end_parallel_reduce",
+          Kokkos::Tools::Experimental::current_callbacks.end_parallel_reduce);
+
+      lookup_function(firstProfileLibrary, "kokkosp_init_library",
+                      Kokkos::Tools::Experimental::current_callbacks.init);
+      lookup_function(firstProfileLibrary, "kokkosp_finalize_library",
+                      Kokkos::Tools::Experimental::current_callbacks.finalize);
+
+      lookup_function(
+          firstProfileLibrary, "kokkosp_push_profile_region",
+          Kokkos::Tools::Experimental::current_callbacks.push_region);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_pop_profile_region",
+          Kokkos::Tools::Experimental::current_callbacks.pop_region);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_allocate_data",
+          Kokkos::Tools::Experimental::current_callbacks.allocate_data);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_deallocate_data",
+          Kokkos::Tools::Experimental::current_callbacks.deallocate_data);
+
+      lookup_function(
+          firstProfileLibrary, "kokkosp_begin_deep_copy",
+          Kokkos::Tools::Experimental::current_callbacks.begin_deep_copy);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_end_deep_copy",
+          Kokkos::Tools::Experimental::current_callbacks.end_deep_copy);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_begin_fence",
+          Kokkos::Tools::Experimental::current_callbacks.begin_fence);
+      lookup_function(firstProfileLibrary, "kokkosp_end_fence",
+                      Kokkos::Tools::Experimental::current_callbacks.end_fence);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_dual_view_sync",
+          Kokkos::Tools::Experimental::current_callbacks.sync_dual_view);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_dual_view_modify",
+          Kokkos::Tools::Experimental::current_callbacks.modify_dual_view);
+
+      lookup_function(
+          firstProfileLibrary, "kokkosp_declare_metadata",
+          Kokkos::Tools::Experimental::current_callbacks.declare_metadata);
+      lookup_function(firstProfileLibrary, "kokkosp_create_profile_section",
+                      Kokkos::Tools::Experimental::current_callbacks
+                          .create_profile_section);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_start_profile_section",
+          Kokkos::Tools::Experimental::current_callbacks.start_profile_section);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_stop_profile_section",
+          Kokkos::Tools::Experimental::current_callbacks.stop_profile_section);
+      lookup_function(firstProfileLibrary, "kokkosp_destroy_profile_section",
+                      Kokkos::Tools::Experimental::current_callbacks
+                          .destroy_profile_section);
+
+      lookup_function(
+          firstProfileLibrary, "kokkosp_profile_event",
+          Kokkos::Tools::Experimental::current_callbacks.profile_event);
 #ifdef KOKKOS_ENABLE_TUNING
-      auto p24 = dlsym(firstProfileLibrary, "kokkosp_declare_output_type");
-      Experimental::set_declare_output_type_callback(
-          *reinterpret_cast<Experimental::outputTypeDeclarationFunction*>(
-              &p24));
-
-      auto p25 = dlsym(firstProfileLibrary, "kokkosp_declare_input_type");
-      Experimental::set_declare_input_type_callback(
-          *reinterpret_cast<Experimental::inputTypeDeclarationFunction*>(&p25));
-      auto p26 = dlsym(firstProfileLibrary, "kokkosp_request_values");
-      Experimental::set_request_output_values_callback(
-          *reinterpret_cast<Experimental::requestValueFunction*>(&p26));
-      auto p27 = dlsym(firstProfileLibrary, "kokkosp_end_context");
-      Experimental::set_end_context_callback(
-          *reinterpret_cast<Experimental::contextEndFunction*>(&p27));
-      auto p28 = dlsym(firstProfileLibrary, "kokkosp_begin_context");
-      Experimental::set_begin_context_callback(
-          *reinterpret_cast<Experimental::contextBeginFunction*>(&p28));
-      auto p29 =
-          dlsym(firstProfileLibrary, "kokkosp_declare_optimization_goal");
-      Experimental::set_declare_optimization_goal_callback(
-          *reinterpret_cast<Experimental::optimizationGoalDeclarationFunction*>(
-              &p29));
+      lookup_function(
+          firstProfileLibrary, "kokkosp_declare_output_type",
+          Kokkos::Tools::Experimental::current_callbacks.declare_output_type);
+
+      lookup_function(
+          firstProfileLibrary, "kokkosp_declare_input_type",
+          Kokkos::Tools::Experimental::current_callbacks.declare_input_type);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_request_values",
+          Kokkos::Tools::Experimental::current_callbacks.request_output_values);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_end_context",
+          Kokkos::Tools::Experimental::current_callbacks.end_tuning_context);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_begin_context",
+          Kokkos::Tools::Experimental::current_callbacks.begin_tuning_context);
+      lookup_function(firstProfileLibrary, "kokkosp_declare_optimization_goal",
+                      Kokkos::Tools::Experimental::current_callbacks
+                          .declare_optimization_goal);
 #endif  // KOKKOS_ENABLE_TUNING
+
+      lookup_function(
+          firstProfileLibrary, "kokkosp_print_help",
+          Kokkos::Tools::Experimental::current_callbacks.print_help);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_parse_args",
+          Kokkos::Tools::Experimental::current_callbacks.parse_args);
+      lookup_function(firstProfileLibrary,
+                      "kokkosp_provide_tool_programming_interface",
+                      Kokkos::Tools::Experimental::current_callbacks
+                          .provide_tool_programming_interface);
+      lookup_function(
+          firstProfileLibrary, "kokkosp_request_tool_settings",
+          Kokkos::Tools::Experimental::current_callbacks.request_tool_settings);
     }
   }
+#else
+  (void)profileLibrary;
 #endif  // KOKKOS_ENABLE_LIBDL
-  if (Experimental::current_callbacks.init != nullptr) {
-    (*Experimental::current_callbacks.init)(
-        0, (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
+      Kokkos::Tools::Experimental::current_callbacks.init, 0,
+      (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
+
+  Experimental::tool_requirements.requires_global_fencing = true;
+
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.request_tool_settings, 1,
+      &Experimental::tool_requirements);
+
+  Experimental::ToolProgrammingInterface actions;
+  actions.fence = &Experimental::Impl::tool_invoked_fence;
+
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.provide_tool_programming_interface, 1,
+      actions);
 
 #ifdef KOKKOS_ENABLE_TUNING
   Experimental::VariableInfo kernel_name;
@@ -548,7 +668,9 @@ void finalize() {
   is_finalized = 1;
 
   if (Experimental::current_callbacks.finalize != nullptr) {
-    (*Experimental::current_callbacks.finalize)();
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.finalize);
 
     Experimental::pause_tools();
   }
@@ -568,17 +690,24 @@ void finalize() {
 
 void syncDualView(const std::string& label, const void* const ptr,
                   bool to_device) {
-  if (Experimental::current_callbacks.sync_dual_view != nullptr) {
-    (*Experimental::current_callbacks.sync_dual_view)(label.c_str(), ptr,
-                                                      to_device);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.sync_dual_view, label.c_str(), ptr,
+      to_device);
 }
 void modifyDualView(const std::string& label, const void* const ptr,
                     bool on_device) {
-  if (Experimental::current_callbacks.modify_dual_view != nullptr) {
-    (*Experimental::current_callbacks.modify_dual_view)(label.c_str(), ptr,
-                                                        on_device);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.modify_dual_view, label.c_str(), ptr,
+      on_device);
+}
+
+void declareMetadata(const std::string& key, const std::string& value) {
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.declare_metadata, key.c_str(),
+      value.c_str());
 }
 
 }  // namespace Tools
@@ -591,6 +720,12 @@ void set_init_callback(initFunction callback) {
 void set_finalize_callback(finalizeFunction callback) {
   current_callbacks.finalize = callback;
 }
+void set_parse_args_callback(parseArgsFunction callback) {
+  current_callbacks.parse_args = callback;
+}
+void set_print_help_callback(printHelpFunction callback) {
+  current_callbacks.print_help = callback;
+}
 void set_begin_parallel_for_callback(beginFunction callback) {
   current_callbacks.begin_parallel_for = callback;
 }
@@ -657,6 +792,9 @@ void set_dual_view_sync_callback(dualViewSyncFunction callback) {
 void set_dual_view_modify_callback(dualViewModifyFunction callback) {
   current_callbacks.modify_dual_view = callback;
 }
+void set_declare_metadata_callback(declareMetadataFunction callback) {
+  current_callbacks.declare_metadata = callback;
+}
 
 void set_declare_output_type_callback(outputTypeDeclarationFunction callback) {
   current_callbacks.declare_output_type = callback;
@@ -751,7 +889,17 @@ void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
 void endDeepCopy() { Kokkos::Tools::endDeepCopy(); }
 
 void finalize() { Kokkos::Tools::finalize(); }
-void initialize() { Kokkos::Tools::initialize(); }
+void initialize(const std::string& profileLibrary) {
+  Kokkos::Tools::initialize(profileLibrary);
+}
+
+bool printHelp(const std::string& args) {
+  return Kokkos::Tools::printHelp(args);
+}
+void parseArgs(const std::string& args) { Kokkos::Tools::parseArgs(args); }
+void parseArgs(int _argc, char** _argv) {
+  Kokkos::Tools::parseArgs(_argc, _argv);
+}
 
 SpaceHandle make_space_handle(const char* space_name) {
   return Kokkos::Tools::make_space_handle(space_name);
@@ -782,10 +930,10 @@ size_t get_new_variable_id() { return get_variable_counter(); }
 size_t declare_output_type(const std::string& variableName, VariableInfo info) {
   size_t variableId = get_new_variable_id();
 #ifdef KOKKOS_ENABLE_TUNING
-  if (Experimental::current_callbacks.declare_output_type != nullptr) {
-    (*Experimental::current_callbacks.declare_output_type)(variableName.c_str(),
-                                                           variableId, &info);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.declare_output_type, variableName.c_str(),
+      variableId, &info);
   variable_metadata[variableId] = info;
 #else
   (void)variableName;
@@ -797,10 +945,10 @@ size_t declare_output_type(const std::string& variableName, VariableInfo info) {
 size_t declare_input_type(const std::string& variableName, VariableInfo info) {
   size_t variableId = get_new_variable_id();
 #ifdef KOKKOS_ENABLE_TUNING
-  if (Experimental::current_callbacks.declare_input_type != nullptr) {
-    (*Experimental::current_callbacks.declare_input_type)(variableName.c_str(),
-                                                          variableId, &info);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.declare_input_type, variableName.c_str(),
+      variableId, &info);
   variable_metadata[variableId] = info;
 #else
   (void)variableName;
@@ -839,8 +987,10 @@ void request_output_values(size_t contextId, size_t count,
     for (size_t x = 0; x < count; ++x) {
       values[x].metadata = &variable_metadata[values[x].type_id];
     }
-    (*Experimental::current_callbacks.request_output_values)(
-        contextId, context_values.size(), context_values.data(), count, values);
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.request_output_values, contextId,
+        context_values.size(), context_values.data(), count, values);
   }
 #else
   (void)contextId;
@@ -854,19 +1004,19 @@ static std::unordered_map<size_t, size_t> optimization_goals;
 #endif
 
 void begin_context(size_t contextId) {
-  if (Experimental::current_callbacks.begin_tuning_context != nullptr) {
-    (*Experimental::current_callbacks.begin_tuning_context)(contextId);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.begin_tuning_context, contextId);
 }
 void end_context(size_t contextId) {
 #ifdef KOKKOS_ENABLE_TUNING
   for (auto id : features_per_context[contextId]) {
     active_features.erase(id);
   }
-  if (Experimental::current_callbacks.end_tuning_context != nullptr) {
-    (*Experimental::current_callbacks.end_tuning_context)(
-        contextId, feature_values[optimization_goals[contextId]]);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.end_tuning_context, contextId,
+      feature_values[optimization_goals[contextId]]);
   optimization_goals.erase(contextId);
   decrement_current_context_id();
 #else
@@ -954,9 +1104,9 @@ size_t get_new_variable_id();
 void declare_optimization_goal(const size_t context,
                                const OptimizationGoal& goal) {
 #ifdef KOKKOS_ENABLE_TUNING
-  if (Experimental::current_callbacks.declare_optimization_goal != nullptr) {
-    (*Experimental::current_callbacks.declare_optimization_goal)(context, goal);
-  }
+  Experimental::invoke_kokkosp_callback(
+      Experimental::MayRequireGlobalFencing::No,
+      Experimental::current_callbacks.declare_optimization_goal, context, goal);
   optimization_goals[context] = goal.type_id;
 #else
   (void)context;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
index 688937623761f72b7d4eacfad9cc27e5a10c57eb..1ff6a36c3bc3c934e787af30c5bd6568046f15f1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
@@ -45,13 +45,13 @@
 #ifndef KOKKOS_IMPL_KOKKOS_PROFILING_HPP
 #define KOKKOS_IMPL_KOKKOS_PROFILING_HPP
 
-#include <impl/Kokkos_Profiling_Interface.hpp>
-#include <Kokkos_Macros.hpp>
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Macros.hpp>
 #include <Kokkos_Tuners.hpp>
-#include <string>
+#include <impl/Kokkos_Profiling_Interface.hpp>
 #include <map>
+#include <string>
 #include <type_traits>
 namespace Kokkos {
 
@@ -125,8 +125,11 @@ void syncDualView(const std::string& label, const void* const ptr,
 void modifyDualView(const std::string& label, const void* const ptr,
                     bool on_device);
 
-void initialize();
+void declareMetadata(const std::string& key, const std::string& value);
+void initialize(const std::string& = {});
 void finalize();
+bool printHelp(const std::string&);
+void parseArgs(const std::string&);
 
 Kokkos_Profiling_SpaceHandle make_space_handle(const char* space_name);
 
@@ -134,6 +137,8 @@ namespace Experimental {
 
 void set_init_callback(initFunction callback);
 void set_finalize_callback(finalizeFunction callback);
+void set_parse_args_callback(parseArgsFunction callback);
+void set_print_help_callback(printHelpFunction callback);
 void set_begin_parallel_for_callback(beginFunction callback);
 void set_end_parallel_for_callback(endFunction callback);
 void set_begin_parallel_reduce_callback(beginFunction callback);
@@ -156,6 +161,7 @@ void set_begin_fence_callback(beginFenceFunction callback);
 void set_end_fence_callback(endFenceFunction callback);
 void set_dual_view_sync_callback(dualViewSyncFunction callback);
 void set_dual_view_modify_callback(dualViewModifyFunction callback);
+void set_declare_metadata_callback(declareMetadataFunction callback);
 
 void set_declare_output_type_callback(outputTypeDeclarationFunction callback);
 void set_declare_input_type_callback(inputTypeDeclarationFunction callback);
@@ -183,10 +189,19 @@ namespace Impl {
 static std::map<std::string, Kokkos::Tools::Experimental::TeamSizeTuner>
     team_tuners;
 
+template <int Rank>
+using MDRangeTuningMap =
+    std::map<std::string, Kokkos::Tools::Experimental::MDRangeTuner<Rank>>;
+
+template <int Rank>
+static MDRangeTuningMap<Rank> mdrange_tuners;
+
+// For any policies without a tuning implementation, with a reducer
 template <class ReducerType, class ExecPolicy, class Functor, typename TagType>
 void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&,
                  TagType) {}
 
+// For any policies without a tuning implementation, without a reducer
 template <class ExecPolicy, class Functor, typename TagType>
 void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&,
                  const TagType&) {}
@@ -225,6 +240,24 @@ struct SimpleTeamSizeCalculator {
     auto max = policy.team_size_recommended(functor, tag);
     return max;
   }
+  template <typename Policy, typename Functor>
+  int get_mdrange_max_tile_size_product(const Policy& policy,
+                                        const Functor& functor,
+                                        const Kokkos::ParallelForTag&) {
+    using exec_space = typename Policy::execution_space;
+    using driver     = Kokkos::Impl::ParallelFor<Functor, Policy, exec_space>;
+    return driver::max_tile_size_product(policy, functor);
+  }
+  template <typename Policy, typename Functor>
+  int get_mdrange_max_tile_size_product(const Policy& policy,
+                                        const Functor& functor,
+                                        const Kokkos::ParallelReduceTag&) {
+    using exec_space = typename Policy::execution_space;
+    using driver =
+        Kokkos::Impl::ParallelReduce<Functor, Policy, Kokkos::InvalidType,
+                                     exec_space>;
+    return driver::max_tile_size_product(policy, functor);
+  }
 };
 
 // when we have a complex reducer, we need to pass an
@@ -251,15 +284,25 @@ struct ComplexReducerSizeCalculator {
     ReducerType reducer_example = ReducerType(value);
     return policy.team_size_recommended(functor, reducer_example, tag);
   }
+  template <typename Policy, typename Functor>
+  int get_mdrange_max_tile_size_product(const Policy& policy,
+                                        const Functor& functor,
+                                        const Kokkos::ParallelReduceTag&) {
+    using exec_space = typename Policy::execution_space;
+    using driver =
+        Kokkos::Impl::ParallelReduce<Functor, Policy, ReducerType, exec_space>;
+    return driver::max_tile_size_product(policy, functor);
+  }
 };
 
 }  // namespace Impl
 
-template <class Functor, class TagType, class... Properties>
-void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
-                 Kokkos::TeamPolicy<Properties...>& policy,
-                 const Functor& functor, const TagType& tag) {
-  if (policy.impl_auto_team_size() || policy.impl_auto_vector_length()) {
+template <class Tuner, class Functor, class TagType,
+          class TuningPermissionFunctor, class Map, class Policy>
+void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy,
+                         const Functor& functor, const TagType& tag,
+                         const TuningPermissionFunctor& should_tune) {
+  if (should_tune(policy)) {
     std::string label = label_in;
     if (label_in.empty()) {
       using policy_type =
@@ -269,12 +312,10 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
       label = name.get();
     }
     auto tuner_iter = [&]() {
-      auto my_tuner = team_tuners.find(label);
-      if (my_tuner == team_tuners.end()) {
-        return (team_tuners
-                    .emplace(label, Kokkos::Tools::Experimental::TeamSizeTuner(
-                                        label, policy, functor, tag,
-                                        Impl::SimpleTeamSizeCalculator{}))
+      auto my_tuner = map.find(label);
+      if (my_tuner == map.end()) {
+        return (map.emplace(label, Tuner(label, policy, functor, tag,
+                                         Impl::SimpleTeamSizeCalculator{}))
                     .first);
       }
       return my_tuner;
@@ -282,12 +323,12 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
     tuner_iter->second.tune(policy);
   }
 }
-
-template <class ReducerType, class Functor, class TagType, class... Properties>
-void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
-                 Kokkos::TeamPolicy<Properties...>& policy,
-                 const Functor& functor, const TagType& tag) {
-  if (policy.impl_auto_team_size() || policy.impl_auto_vector_length()) {
+template <class Tuner, class ReducerType, class Functor, class TagType,
+          class TuningPermissionFunctor, class Map, class Policy>
+void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy,
+                         const Functor& functor, const TagType& tag,
+                         const TuningPermissionFunctor& should_tune) {
+  if (should_tune(policy)) {
     std::string label = label_in;
     if (label_in.empty()) {
       using policy_type =
@@ -297,15 +338,13 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
       label = name.get();
     }
     auto tuner_iter = [&]() {
-      auto my_tuner = team_tuners.find(label);
-      if (my_tuner == team_tuners.end()) {
-        return (
-            team_tuners
-                .emplace(label,
-                         Kokkos::Tools::Experimental::TeamSizeTuner(
-                             label, policy, functor, tag,
+      auto my_tuner = map.find(label);
+      if (my_tuner == map.end()) {
+        return (map.emplace(
+                       label,
+                       Tuner(label, policy, functor, tag,
                              Impl::ComplexReducerSizeCalculator<ReducerType>{}))
-                .first);
+                    .first);
       }
       return my_tuner;
     }();
@@ -313,6 +352,60 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
   }
 }
 
+// tune a TeamPolicy, without reducer
+template <class Functor, class TagType, class... Properties>
+void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
+                 Kokkos::TeamPolicy<Properties...>& policy,
+                 const Functor& functor, const TagType& tag) {
+  generic_tune_policy<Experimental::TeamSizeTuner>(
+      label_in, team_tuners, policy, functor, tag,
+      [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) {
+        return (candidate_policy.impl_auto_team_size() ||
+                candidate_policy.impl_auto_vector_length());
+      });
+}
+
+// tune a TeamPolicy, with reducer
+template <class ReducerType, class Functor, class TagType, class... Properties>
+void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
+                 Kokkos::TeamPolicy<Properties...>& policy,
+                 const Functor& functor, const TagType& tag) {
+  generic_tune_policy<Experimental::TeamSizeTuner, ReducerType>(
+      label_in, team_tuners, policy, functor, tag,
+      [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) {
+        return (candidate_policy.impl_auto_team_size() ||
+                candidate_policy.impl_auto_vector_length());
+      });
+}
+
+// tune a MDRangePolicy, without reducer
+template <class Functor, class TagType, class... Properties>
+void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
+                 Kokkos::MDRangePolicy<Properties...>& policy,
+                 const Functor& functor, const TagType& tag) {
+  using Policy              = Kokkos::MDRangePolicy<Properties...>;
+  static constexpr int rank = Policy::rank;
+  generic_tune_policy<Experimental::MDRangeTuner<rank>>(
+      label_in, mdrange_tuners<rank>, policy, functor, tag,
+      [](const Policy& candidate_policy) {
+        return candidate_policy.impl_tune_tile_size();
+      });
+}
+
+// tune a MDRangePolicy, with reducer
+template <class ReducerType, class Functor, class TagType, class... Properties>
+void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
+                 Kokkos::MDRangePolicy<Properties...>& policy,
+                 const Functor& functor, const TagType& tag) {
+  using Policy              = Kokkos::MDRangePolicy<Properties...>;
+  static constexpr int rank = Policy::rank;
+  generic_tune_policy<Experimental::MDRangeTuner<rank>, ReducerType>(
+      label_in, mdrange_tuners<rank>, policy, functor, tag,
+      [](const Policy& candidate_policy) {
+        return candidate_policy.impl_tune_tile_size();
+      });
+}
+
 template <class ReducerType>
 struct ReductionSwitcher {
   template <class Functor, class TagType, class ExecPolicy>
@@ -337,16 +430,12 @@ struct ReductionSwitcher<Kokkos::InvalidType> {
   }
 };
 
-template <class ExecPolicy, class Functor, typename TagType>
-void report_policy_results(const size_t, const std::string&, ExecPolicy&,
-                           const Functor&, const TagType&) {}
-
-template <class Functor, class TagType, class... Properties>
-void report_policy_results(const size_t /**tuning_context*/,
-                           const std::string& label_in,
-                           Kokkos::TeamPolicy<Properties...> policy,
-                           const Functor&, const TagType&) {
-  if (policy.impl_auto_team_size() || policy.impl_auto_vector_length()) {
+template <class Tuner, class Functor, class TagType,
+          class TuningPermissionFunctor, class Map, class Policy>
+void generic_report_results(const std::string& label_in, Map& map,
+                            Policy& policy, const Functor&, const TagType&,
+                            const TuningPermissionFunctor& should_tune) {
+  if (should_tune(policy)) {
     std::string label = label_in;
     if (label_in.empty()) {
       using policy_type =
@@ -355,11 +444,45 @@ void report_policy_results(const size_t /**tuning_context*/,
       Kokkos::Impl::ParallelConstructName<Functor, work_tag> name(label);
       label = name.get();
     }
-    auto& tuner = team_tuners[label];
-    tuner.end();
+    auto tuner_iter = map[label];
+    tuner_iter.end();
   }
 }
 
+// report results for a policy type we don't tune (do nothing)
+template <class ExecPolicy, class Functor, typename TagType>
+void report_policy_results(const size_t, const std::string&, ExecPolicy&,
+                           const Functor&, const TagType&) {}
+
+// report results for a TeamPolicy
+template <class Functor, class TagType, class... Properties>
+void report_policy_results(const size_t /**tuning_context*/,
+                           const std::string& label_in,
+                           Kokkos::TeamPolicy<Properties...>& policy,
+                           const Functor& functor, const TagType& tag) {
+  generic_report_results<Experimental::TeamSizeTuner>(
+      label_in, team_tuners, policy, functor, tag,
+      [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) {
+        return (candidate_policy.impl_auto_team_size() ||
+                candidate_policy.impl_auto_vector_length());
+      });
+}
+
+// report results for an MDRangePolicy
+template <class Functor, class TagType, class... Properties>
+void report_policy_results(const size_t /**tuning_context*/,
+                           const std::string& label_in,
+                           Kokkos::MDRangePolicy<Properties...>& policy,
+                           const Functor& functor, const TagType& tag) {
+  using Policy              = Kokkos::MDRangePolicy<Properties...>;
+  static constexpr int rank = Policy::rank;
+  generic_report_results<Experimental::MDRangeTuner<rank>>(
+      label_in, mdrange_tuners<rank>, policy, functor, tag,
+      [](const Policy& candidate_policy) {
+        return candidate_policy.impl_tune_tile_size();
+      });
+}
+
 template <class ExecPolicy, class FunctorType>
 void begin_parallel_for(ExecPolicy& policy, FunctorType& functor,
                         const std::string& label, uint64_t& kpID) {
@@ -515,7 +638,8 @@ void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
                    const uint64_t size);
 void endDeepCopy();
 void finalize();
-void initialize();
+void initialize(const std::string& = {});
+
 SpaceHandle make_space_handle(const char* space_name);
 
 namespace Experimental {
@@ -533,7 +657,9 @@ using Kokkos::Tools::Experimental::set_end_parallel_reduce_callback;
 using Kokkos::Tools::Experimental::set_end_parallel_scan_callback;
 using Kokkos::Tools::Experimental::set_finalize_callback;
 using Kokkos::Tools::Experimental::set_init_callback;
+using Kokkos::Tools::Experimental::set_parse_args_callback;
 using Kokkos::Tools::Experimental::set_pop_region_callback;
+using Kokkos::Tools::Experimental::set_print_help_callback;
 using Kokkos::Tools::Experimental::set_profile_event_callback;
 using Kokkos::Tools::Experimental::set_push_region_callback;
 using Kokkos::Tools::Experimental::set_start_profile_section_callback;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
index de771d330142e8313f2afc16811da0dcc77a04d4..ed8751c50cc04d915b7b3c371a6ec05756ff6087 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
@@ -54,7 +54,7 @@
 #include <stdbool.h>
 #endif
 
-#define KOKKOSP_INTERFACE_VERSION 20200625
+#define KOKKOSP_INTERFACE_VERSION 20210225
 
 // Profiling
 
@@ -73,6 +73,10 @@ typedef void (*Kokkos_Profiling_initFunction)(
 // NOLINTNEXTLINE(modernize-use-using): C compatibility
 typedef void (*Kokkos_Profiling_finalizeFunction)();
 // NOLINTNEXTLINE(modernize-use-using): C compatibility
+typedef void (*Kokkos_Profiling_parseArgsFunction)(int, char**);
+// NOLINTNEXTLINE(modernize-use-using): C compatibility
+typedef void (*Kokkos_Profiling_printHelpFunction)(char*);
+// NOLINTNEXTLINE(modernize-use-using): C compatibility
 typedef void (*Kokkos_Profiling_beginFunction)(const char*, const uint32_t,
                                                uint64_t*);
 // NOLINTNEXTLINE(modernize-use-using): C compatibility
@@ -123,6 +127,33 @@ typedef void (*Kokkos_Profiling_dualViewModifyFunction)(const char*,
                                                         const void* const,
                                                         bool);
 
+// NOLINTNEXTLINE(modernize-use-using): C compatibility
+typedef void (*Kokkos_Profiling_declareMetadataFunction)(const char*,
+                                                         const char*);
+
+// NOLINTNEXTLINE(modernize-use-using): C compatibility
+typedef void (*Kokkos_Tools_toolInvokedFenceFunction)(const uint32_t);
+
+// NOLINTNEXTLINE(modernize-use-using): C compatibility
+typedef void (*Kokkos_Tools_functionPointer)();
+struct Kokkos_Tools_ToolProgrammingInterface {
+  Kokkos_Tools_toolInvokedFenceFunction fence;
+  // allow addition of more actions
+  Kokkos_Tools_functionPointer padding[31];
+};
+
+struct Kokkos_Tools_ToolSettings {
+  bool requires_global_fencing;
+  bool padding[255];
+};
+
+// NOLINTNEXTLINE(modernize-use-using): C compatibility
+typedef void (*Kokkos_Tools_provideToolProgrammingInterfaceFunction)(
+    const uint32_t, struct Kokkos_Tools_ToolProgrammingInterface);
+// NOLINTNEXTLINE(modernize-use-using): C compatibility
+typedef void (*Kokkos_Tools_requestToolSettingsFunction)(
+    const uint32_t, struct Kokkos_Tools_ToolSettings*);
+
 // Tuning
 
 #define KOKKOS_TOOLS_TUNING_STRING_LENGTH 64
@@ -217,11 +248,11 @@ typedef void (*Kokkos_Tools_contextEndFunction)(
 typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)(
     const size_t, const struct Kokkos_Tools_OptimzationGoal goal);
 
-typedef void (*function_pointer)();
-
 struct Kokkos_Profiling_EventSet {
   Kokkos_Profiling_initFunction init;
   Kokkos_Profiling_finalizeFunction finalize;
+  Kokkos_Profiling_parseArgsFunction parse_args;
+  Kokkos_Profiling_printHelpFunction print_help;
   Kokkos_Profiling_beginFunction begin_parallel_for;
   Kokkos_Profiling_endFunction end_parallel_for;
   Kokkos_Profiling_beginFunction begin_parallel_reduce;
@@ -243,17 +274,23 @@ struct Kokkos_Profiling_EventSet {
   Kokkos_Profiling_endFenceFunction end_fence;
   Kokkos_Profiling_dualViewSyncFunction sync_dual_view;
   Kokkos_Profiling_dualViewModifyFunction modify_dual_view;
-  char profiling_padding[12 * sizeof(function_pointer)];
+  Kokkos_Profiling_declareMetadataFunction declare_metadata;
+  Kokkos_Tools_provideToolProgrammingInterfaceFunction
+      provide_tool_programming_interface;
+  Kokkos_Tools_requestToolSettingsFunction request_tool_settings;
+  char profiling_padding[9 * sizeof(Kokkos_Tools_functionPointer)];
   Kokkos_Tools_outputTypeDeclarationFunction declare_output_type;
   Kokkos_Tools_inputTypeDeclarationFunction declare_input_type;
   Kokkos_Tools_requestValueFunction request_output_values;
   Kokkos_Tools_contextBeginFunction begin_tuning_context;
   Kokkos_Tools_contextEndFunction end_tuning_context;
   Kokkos_Tools_optimizationGoalDeclarationFunction declare_optimization_goal;
-  char padding[234 *
-               sizeof(function_pointer)];  // allows us to add another 256
-                                           // events to the Tools interface
-                                           // without changing struct layout
+  char padding[232 *
+               sizeof(
+                   Kokkos_Tools_functionPointer)];  // allows us to add another
+                                                    // 256 events to the Tools
+                                                    // interface without
+                                                    // changing struct layout
 };
 
 #endif  // KOKKOS_PROFILING_C_INTERFACE_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index fad7a78e393d1dcf48468e735b5bd8b90f00c459..7809632f78ddf33d8429b353723736b68e3b7536 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -101,12 +101,29 @@ namespace Tools {
 
 namespace Experimental {
 using EventSet = Kokkos_Profiling_EventSet;
-static_assert(sizeof(EventSet) / sizeof(function_pointer) == 275,
+static_assert(sizeof(EventSet) / sizeof(Kokkos_Tools_functionPointer) == 275,
               "sizeof EventSet has changed, this is an error on the part of a "
               "Kokkos developer");
+static_assert(sizeof(Kokkos_Tools_ToolSettings) / sizeof(bool) == 256,
+              "sizeof EventSet has changed, this is an error on the part of a "
+              "Kokkos developer");
+static_assert(sizeof(Kokkos_Tools_ToolProgrammingInterface) /
+                      sizeof(Kokkos_Tools_functionPointer) ==
+                  32,
+              "sizeof EventSet has changed, this is an error on the part of a "
+              "Kokkos developer");
+
+using toolInvokedFenceFunction = Kokkos_Tools_toolInvokedFenceFunction;
+using provideToolProgrammingInterfaceFunction =
+    Kokkos_Tools_provideToolProgrammingInterfaceFunction;
+using requestToolSettingsFunction = Kokkos_Tools_requestToolSettingsFunction;
+using ToolSettings                = Kokkos_Tools_ToolSettings;
+using ToolProgrammingInterface    = Kokkos_Tools_ToolProgrammingInterface;
 }  // namespace Experimental
 using initFunction           = Kokkos_Profiling_initFunction;
 using finalizeFunction       = Kokkos_Profiling_finalizeFunction;
+using parseArgsFunction      = Kokkos_Profiling_parseArgsFunction;
+using printHelpFunction      = Kokkos_Profiling_printHelpFunction;
 using beginFunction          = Kokkos_Profiling_beginFunction;
 using endFunction            = Kokkos_Profiling_endFunction;
 using pushFunction           = Kokkos_Profiling_pushFunction;
@@ -120,13 +137,14 @@ using startProfileSectionFunction =
 using stopProfileSectionFunction = Kokkos_Profiling_stopProfileSectionFunction;
 using destroyProfileSectionFunction =
     Kokkos_Profiling_destroyProfileSectionFunction;
-using profileEventFunction   = Kokkos_Profiling_profileEventFunction;
-using beginDeepCopyFunction  = Kokkos_Profiling_beginDeepCopyFunction;
-using endDeepCopyFunction    = Kokkos_Profiling_endDeepCopyFunction;
-using beginFenceFunction     = Kokkos_Profiling_beginFenceFunction;
-using endFenceFunction       = Kokkos_Profiling_endFenceFunction;
-using dualViewSyncFunction   = Kokkos_Profiling_dualViewSyncFunction;
-using dualViewModifyFunction = Kokkos_Profiling_dualViewModifyFunction;
+using profileEventFunction    = Kokkos_Profiling_profileEventFunction;
+using beginDeepCopyFunction   = Kokkos_Profiling_beginDeepCopyFunction;
+using endDeepCopyFunction     = Kokkos_Profiling_endDeepCopyFunction;
+using beginFenceFunction      = Kokkos_Profiling_beginFenceFunction;
+using endFenceFunction        = Kokkos_Profiling_endFenceFunction;
+using dualViewSyncFunction    = Kokkos_Profiling_dualViewSyncFunction;
+using dualViewModifyFunction  = Kokkos_Profiling_dualViewModifyFunction;
+using declareMetadataFunction = Kokkos_Profiling_declareMetadataFunction;
 
 }  // namespace Tools
 
@@ -161,7 +179,9 @@ using Kokkos::Tools::endDeepCopyFunction;
 using Kokkos::Tools::endFunction;
 using Kokkos::Tools::finalizeFunction;
 using Kokkos::Tools::initFunction;
+using Kokkos::Tools::parseArgsFunction;
 using Kokkos::Tools::popFunction;
+using Kokkos::Tools::printHelpFunction;
 using Kokkos::Tools::profileEventFunction;
 using Kokkos::Tools::pushFunction;
 using Kokkos::Tools::SpaceHandle;
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
index 646280faee1e62e59046e5e9bcfa090cf19e21de..64dfd5d33fb8576b1cb5446843edefaaf6d67422 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
@@ -45,16 +45,29 @@
 #ifndef KOKKOS_SHARED_ALLOC_HPP
 #define KOKKOS_SHARED_ALLOC_HPP
 
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Error.hpp>  // Impl::throw_runtime_exception
+
 #include <cstdint>
 #include <string>
 
-// undefined at end of file
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
+// Base function.
+static constexpr bool kokkos_omp_on_host() { return true; }
 #if defined(KOKKOS_COMPILER_PGI)
 #define KOKKOS_IMPL_IF_ON_HOST if (!__builtin_is_device_code())
 #else
 // Note: OpenMPTarget enforces C++17 at configure time
-#define KOKKOS_IMPL_IF_ON_HOST if constexpr (omp_is_initial_device())
+#pragma omp begin declare variant match(device = {kind(host)})
+static constexpr bool kokkos_omp_on_host() { return true; }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(nohost)})
+static constexpr bool kokkos_omp_on_host() { return false; }
+#pragma omp end declare variant
+
+#define KOKKOS_IMPL_IF_ON_HOST if constexpr (kokkos_omp_on_host())
 #endif
 #else
 #define KOKKOS_IMPL_IF_ON_HOST if (true)
@@ -66,6 +79,9 @@ namespace Impl {
 template <class MemorySpace = void, class DestroyFunctor = void>
 class SharedAllocationRecord;
 
+template <class MemorySpace>
+class SharedAllocationRecordCommon;
+
 class SharedAllocationHeader {
  private:
   using Record = SharedAllocationRecord<void, void>;
@@ -75,6 +91,10 @@ class SharedAllocationHeader {
 
   template <class, class>
   friend class SharedAllocationRecord;
+  template <class>
+  friend class SharedAllocationRecordCommon;
+  template <class>
+  friend class HostInaccessibleSharedAllocationRecordCommon;
 
   Record* m_record;
   char m_label[maximum_label_length];
@@ -99,6 +119,10 @@ class SharedAllocationRecord<void, void> {
 
   template <class, class>
   friend class SharedAllocationRecord;
+  template <class>
+  friend class SharedAllocationRecordCommon;
+  template <class>
+  friend class HostInaccessibleSharedAllocationRecordCommon;
 
   using function_type = void (*)(SharedAllocationRecord<void, void>*);
 
@@ -229,6 +253,58 @@ class SharedAllocationRecord<void, void> {
       const SharedAllocationRecord* const root, const bool detail);
 };
 
+template <class MemorySpace>
+class SharedAllocationRecordCommon : public SharedAllocationRecord<void, void> {
+ private:
+  using derived_t     = SharedAllocationRecord<MemorySpace, void>;
+  using record_base_t = SharedAllocationRecord<void, void>;
+  derived_t& self() { return *static_cast<derived_t*>(this); }
+  derived_t const& self() const { return *static_cast<derived_t const*>(this); }
+
+ protected:
+  using record_base_t::record_base_t;
+
+  void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header,
+                                         std::string const& arg_label);
+
+  static void deallocate(record_base_t* arg_rec);
+
+ public:
+  static auto allocate(MemorySpace const& arg_space,
+                       std::string const& arg_label, size_t arg_alloc_size)
+      -> derived_t*;
+  /**\brief  Allocate tracked memory in the space */
+  static void* allocate_tracked(MemorySpace const& arg_space,
+                                std::string const& arg_alloc_label,
+                                size_t arg_alloc_size);
+  /**\brief  Reallocate tracked memory in the space */
+  static void deallocate_tracked(void* arg_alloc_ptr);
+  /**\brief  Deallocate tracked memory in the space */
+  static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size);
+  static auto get_record(void* alloc_ptr) -> derived_t*;
+  std::string get_label() const;
+  static void print_records(std::ostream& s, MemorySpace const&,
+                            bool detail = false);
+};
+
+template <class MemorySpace>
+class HostInaccessibleSharedAllocationRecordCommon
+    : public SharedAllocationRecordCommon<MemorySpace> {
+ private:
+  using base_t        = SharedAllocationRecordCommon<MemorySpace>;
+  using derived_t     = SharedAllocationRecord<MemorySpace, void>;
+  using record_base_t = SharedAllocationRecord<void, void>;
+
+ protected:
+  using base_t::base_t;
+
+ public:
+  static void print_records(std::ostream& s, MemorySpace const&,
+                            bool detail = false);
+  static auto get_record(void* alloc_ptr) -> derived_t*;
+  std::string get_label() const;
+};
+
 namespace {
 
 /* Taking the address of this function so make sure it is unique */
@@ -508,5 +584,4 @@ union SharedAllocationTracker {
 
 } /* namespace Impl */
 } /* namespace Kokkos */
-#undef KOKKOS_IMPL_IF_ON_HOST
 #endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a6ee1b3f9eb11ddfbfd2c1ce5dd7a213bd25dda9
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp
@@ -0,0 +1,287 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (12/8/20) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_SHAREDALLOC_TIMPL_HPP
+#define KOKKOS_IMPL_SHAREDALLOC_TIMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Core_fwd.hpp>
+
+#include <impl/Kokkos_SharedAlloc.hpp>
+
+#include <Kokkos_HostSpace.hpp>  // used with HostInaccessible specializations
+
+#include <string>    // std::string
+#include <cstring>   // strncpy
+#include <iostream>  // ostream
+
+namespace Kokkos {
+namespace Impl {
+
+template <class MemorySpace>
+auto SharedAllocationRecordCommon<MemorySpace>::allocate(
+    MemorySpace const& arg_space, std::string const& arg_label,
+    size_t arg_alloc_size) -> derived_t* {
+  return new derived_t(arg_space, arg_label, arg_alloc_size);
+}
+
+template <class MemorySpace>
+void* SharedAllocationRecordCommon<MemorySpace>::allocate_tracked(
+    const MemorySpace& arg_space, const std::string& arg_alloc_label,
+    size_t arg_alloc_size) {
+  if (!arg_alloc_size) return nullptr;
+
+  SharedAllocationRecord* const r =
+      allocate(arg_space, arg_alloc_label, arg_alloc_size);
+
+  record_base_t::increment(r);
+
+  return r->data();
+}
+
+template <class MemorySpace>
+void SharedAllocationRecordCommon<MemorySpace>::deallocate(
+    SharedAllocationRecordCommon::record_base_t* arg_rec) {
+  delete static_cast<derived_t*>(arg_rec);
+}
+
+template <class MemorySpace>
+void SharedAllocationRecordCommon<MemorySpace>::deallocate_tracked(
+    void* arg_alloc_ptr) {
+  if (arg_alloc_ptr != nullptr) {
+    SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr);
+    record_base_t::decrement(r);
+  }
+}
+
+template <class MemorySpace>
+void* SharedAllocationRecordCommon<MemorySpace>::reallocate_tracked(
+    void* arg_alloc_ptr, size_t arg_alloc_size) {
+  derived_t* const r_old = derived_t::get_record(arg_alloc_ptr);
+  derived_t* const r_new =
+      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
+
+  Kokkos::Impl::DeepCopy<MemorySpace, MemorySpace>(
+      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
+
+  record_base_t::increment(r_new);
+  record_base_t::decrement(r_old);
+
+  return r_new->data();
+}
+
+template <class MemorySpace>
+auto SharedAllocationRecordCommon<MemorySpace>::get_record(void* alloc_ptr)
+    -> derived_t* {
+  using Header = SharedAllocationHeader;
+
+  Header const* const h = alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;
+
+  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
+    Kokkos::Impl::throw_runtime_exception(
+        std::string("Kokkos::Impl::SharedAllocationRecordCommon<") +
+        std::string(MemorySpace::name()) +
+        std::string(">::get_record() ERROR"));
+  }
+
+  return static_cast<derived_t*>(h->m_record);
+}
+
+template <class MemorySpace>
+std::string SharedAllocationRecordCommon<MemorySpace>::get_label() const {
+  return std::string(record_base_t::head()->m_label);
+}
+
+template <class MemorySpace>
+void SharedAllocationRecordCommon<MemorySpace>::
+    _fill_host_accessible_header_info(SharedAllocationHeader& arg_header,
+                                      std::string const& arg_label) {
+  // Fill in the Header information, directly accessible on the host
+
+  arg_header.m_record = &self();
+
+  strncpy(arg_header.m_label, arg_label.c_str(),
+          SharedAllocationHeader::maximum_label_length);
+  // Set last element zero, in case c_str is too long
+  arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0';
+}
+
+template <class MemorySpace>
+void SharedAllocationRecordCommon<MemorySpace>::print_records(
+    std::ostream& s, const MemorySpace&, bool detail) {
+  (void)s;
+  (void)detail;
+#ifdef KOKKOS_ENABLE_DEBUG
+  SharedAllocationRecord<void, void>::print_host_accessible_records(
+      s, MemorySpace::name(), &derived_t::s_root_record, detail);
+#else
+  Kokkos::Impl::throw_runtime_exception(
+      std::string("SharedAllocationHeader<") +
+      std::string(MemorySpace::name()) +
+      std::string(
+          ">::print_records only works with KOKKOS_ENABLE_DEBUG enabled"));
+#endif
+}
+
+template <class MemorySpace>
+void HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::print_records(
+    std::ostream& s, const MemorySpace&, bool detail) {
+  (void)s;
+  (void)detail;
+#ifdef KOKKOS_ENABLE_DEBUG
+  SharedAllocationRecord<void, void>* r = &derived_t::s_root_record;
+
+  char buffer[256];
+
+  SharedAllocationHeader head;
+
+  if (detail) {
+    do {
+      if (r->m_alloc_ptr) {
+        Kokkos::Impl::DeepCopy<HostSpace, MemorySpace>(
+            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
+      } else {
+        head.m_label[0] = 0;
+      }
+
+      // Formatting dependent on sizeof(uintptr_t)
+      const char* format_string;
+
+      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
+        format_string =
+            "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx "
+            "+ %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
+      } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
+        format_string =
+            "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ "
+            "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
+      }
+
+      snprintf(buffer, 256, format_string, MemorySpace::execution_space::name(),
+               reinterpret_cast<uintptr_t>(r),
+               reinterpret_cast<uintptr_t>(r->m_prev),
+               reinterpret_cast<uintptr_t>(r->m_next),
+               reinterpret_cast<uintptr_t>(r->m_alloc_ptr), r->m_alloc_size,
+               r->m_count, reinterpret_cast<uintptr_t>(r->m_dealloc),
+               head.m_label);
+      s << buffer;
+      r = r->m_next;
+    } while (r != &derived_t::s_root_record);
+  } else {
+    do {
+      if (r->m_alloc_ptr) {
+        Kokkos::Impl::DeepCopy<HostSpace, MemorySpace>(
+            &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader));
+
+        // Formatting dependent on sizeof(uintptr_t)
+        const char* format_string;
+
+        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
+          format_string = "%s [ 0x%.12lx + %ld ] %s\n";
+        } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
+          format_string = "%s [ 0x%.12llx + %ld ] %s\n";
+        }
+
+        snprintf(
+            buffer, 256, format_string, MemorySpace::execution_space::name(),
+            reinterpret_cast<uintptr_t>(r->data()), r->size(), head.m_label);
+      } else {
+        snprintf(buffer, 256, "%s [ 0 + 0 ]\n",
+                 MemorySpace::execution_space::name());
+      }
+      s << buffer;
+      r = r->m_next;
+    } while (r != &derived_t::s_root_record);
+  }
+#else
+  Kokkos::Impl::throw_runtime_exception(
+      std::string("SharedAllocationHeader<") +
+      std::string(MemorySpace::name()) +
+      std::string(
+          ">::print_records only works with KOKKOS_ENABLE_DEBUG enabled"));
+#endif
+}
+
+template <class MemorySpace>
+auto HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::get_record(
+    void* alloc_ptr) -> derived_t* {
+  // Copy the header from the allocation
+  SharedAllocationHeader head;
+
+  SharedAllocationHeader const* const head_cuda =
+      alloc_ptr ? SharedAllocationHeader::get_header(alloc_ptr) : nullptr;
+
+  if (alloc_ptr) {
+    Kokkos::Impl::DeepCopy<HostSpace, MemorySpace>(
+        &head, head_cuda, sizeof(SharedAllocationHeader));
+  }
+
+  derived_t* const record =
+      alloc_ptr ? static_cast<derived_t*>(head.m_record) : nullptr;
+
+  if (!alloc_ptr || record->m_alloc_ptr != head_cuda) {
+    Kokkos::Impl::throw_runtime_exception(
+        std::string("Kokkos::Impl::SharedAllocationRecord<") +
+        std::string(MemorySpace::name()) +
+        std::string(", void>::get_record ERROR"));
+  }
+
+  return record;
+}
+
+template <class MemorySpace>
+std::string
+HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::get_label() const {
+  SharedAllocationHeader header;
+
+  Kokkos::Impl::DeepCopy<Kokkos::HostSpace, MemorySpace>(
+      &header, this->record_base_t::head(), sizeof(SharedAllocationHeader));
+
+  return std::string(header.m_label);
+}
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_IMPL_SHAREDALLOC_TIMPL_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
index 85de7c977c99d2e5d8867ea8dd7d945ae3482c9d..0773a0914befe4e9db3b3b79ae3c446bcb0f3ad1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
@@ -225,9 +225,9 @@ class SimpleTaskScheduler
                                      Impl::DefaultDestroy<task_queue_type> >;
 
     // Allocate space for the task queue
-    auto* record =
-        record_type::allocate(memory_space(), "TaskQueue", allocation_size);
-    m_queue = new (record->data())
+    auto* record = record_type::allocate(memory_space(), "Kokkos::TaskQueue",
+                                         allocation_size);
+    m_queue      = new (record->data())
         task_queue_type(arg_execution_space, arg_memory_space, arg_memory_pool);
     record->m_destroy.managed_object = m_queue;
     m_track.assign_allocated_record_to_uninitialized(record);
diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
index 2ae5c7863c4d25ee70769cf51e8df83e50a74861..8ac034e249f1c1d1a4309003ee77c0cbe38682de 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
@@ -49,12 +49,8 @@
 #include <impl/Kokkos_Spinwait.hpp>
 #include <impl/Kokkos_BitOps.hpp>
 
-#if defined(KOKKOS_ENABLE_STDTHREAD) || defined(_WIN32)
 #include <thread>
-#elif !defined(_WIN32)
-#include <sched.h>
-#include <time.h>
-#else
+#if defined(_WIN32)
 #include <process.h>
 #include <winsock2.h>
 #include <windows.h>
@@ -73,28 +69,14 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) {
 
   if (WaitMode::ROOT != mode) {
     if (sleep_limit < i) {
-      // Attempt to put the thread to sleep for 'c' milliseconds
-
-#if defined(KOKKOS_ENABLE_STDTHREAD) || defined(_WIN32)
-      auto start = std::chrono::high_resolution_clock::now();
+      // Attempt to put the thread to sleep for 'c' microseconds
       std::this_thread::yield();
-      std::this_thread::sleep_until(start + std::chrono::nanoseconds(c * 1000));
-#else
-      timespec req;
-      req.tv_sec  = 0;
-      req.tv_nsec = 1000 * c;
-      nanosleep(&req, nullptr);
-#endif
+      std::this_thread::sleep_for(std::chrono::microseconds(c));
     }
 
     else if (mode == WaitMode::PASSIVE || yield_limit < i) {
       // Attempt to yield thread resources to runtime
-
-#if defined(KOKKOS_ENABLE_STDTHREAD) || defined(_WIN32)
       std::this_thread::yield();
-#else
-      sched_yield();
-#endif
     }
 #if defined(KOKKOS_ENABLE_ASM)
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
index ed0ede86e0bc3638e5e9458caa5f10d83578ee93..d88230f5b247829dbf6e8ee79b111cb2d1309118 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
@@ -147,13 +147,6 @@ struct are_integral<T, Args...> {
 namespace Kokkos {
 namespace Impl {
 
-//----------------------------------------------------------------------------
-
-template <class, class T = void>
-struct enable_if_type {
-  using type = T;
-};
-
 //----------------------------------------------------------------------------
 // if_
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
index facc8ba5b08d779ebd5053d5a952796e069ef9ef..cb8cf281ae06fe0a71862b47428a2ffa12f4bd67 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -49,6 +49,7 @@
 #include <cstdint>
 #include <type_traits>
 #include <initializer_list>  // in-order comma operator fold emulation
+#include <utility>           // integer_sequence and friends
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -56,338 +57,6 @@
 namespace Kokkos {
 namespace Impl {
 
-//----------------------------------------
-// C++14 integer sequence
-template <typename T, T... Ints>
-struct integer_sequence {
-  using value_type = T;
-  static constexpr std::size_t size() noexcept { return sizeof...(Ints); }
-};
-
-template <typename T, std::size_t N>
-struct make_integer_sequence_helper;
-
-template <typename T, T N>
-using make_integer_sequence = typename make_integer_sequence_helper<T, N>::type;
-
-template <typename T>
-struct make_integer_sequence_helper<T, 0> {
-  using type = integer_sequence<T>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 1> {
-  using type = integer_sequence<T, 0>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 2> {
-  using type = integer_sequence<T, 0, 1>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 3> {
-  using type = integer_sequence<T, 0, 1, 2>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 4> {
-  using type = integer_sequence<T, 0, 1, 2, 3>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 5> {
-  using type = integer_sequence<T, 0, 1, 2, 3, 4>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 6> {
-  using type = integer_sequence<T, 0, 1, 2, 3, 4, 5>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 7> {
-  using type = integer_sequence<T, 0, 1, 2, 3, 4, 5, 6>;
-};
-
-template <typename T>
-struct make_integer_sequence_helper<T, 8> {
-  using type = integer_sequence<T, 0, 1, 2, 3, 4, 5, 6, 7>;
-};
-
-template <typename X, typename Y>
-struct make_integer_sequence_concat;
-
-template <typename T, T... x, T... y>
-struct make_integer_sequence_concat<integer_sequence<T, x...>,
-                                    integer_sequence<T, y...>> {
-  using type = integer_sequence<T, x..., (sizeof...(x) + y)...>;
-};
-
-template <typename T, std::size_t N>
-struct make_integer_sequence_helper {
-  using type = typename make_integer_sequence_concat<
-      typename make_integer_sequence_helper<T, N / 2>::type,
-      typename make_integer_sequence_helper<T, N - N / 2>::type>::type;
-};
-
-//----------------------------------------
-
-template <std::size_t... Indices>
-using index_sequence = integer_sequence<std::size_t, Indices...>;
-
-template <std::size_t N>
-using make_index_sequence = make_integer_sequence<std::size_t, N>;
-
-//----------------------------------------
-
-template <unsigned I, typename IntegerSequence>
-struct integer_sequence_at;
-
-template <unsigned I, typename T, T h0, T... tail>
-struct integer_sequence_at<I, integer_sequence<T, h0, tail...>>
-    : public integer_sequence_at<I - 1u, integer_sequence<T, tail...>> {
-  static_assert(8 <= I, "Reasoning Error");
-  static_assert(I < integer_sequence<T, h0, tail...>::size(),
-                "Error: Index out of bounds");
-};
-
-template <typename T, T h0, T... tail>
-struct integer_sequence_at<0u, integer_sequence<T, h0, tail...>> {
-  using type               = T;
-  static constexpr T value = h0;
-};
-
-template <typename T, T h0, T h1, T... tail>
-struct integer_sequence_at<1u, integer_sequence<T, h0, h1, tail...>> {
-  using type               = T;
-  static constexpr T value = h1;
-};
-
-template <typename T, T h0, T h1, T h2, T... tail>
-struct integer_sequence_at<2u, integer_sequence<T, h0, h1, h2, tail...>> {
-  using type               = T;
-  static constexpr T value = h2;
-};
-
-template <typename T, T h0, T h1, T h2, T h3, T... tail>
-struct integer_sequence_at<3u, integer_sequence<T, h0, h1, h2, h3, tail...>> {
-  using type               = T;
-  static constexpr T value = h3;
-};
-
-template <typename T, T h0, T h1, T h2, T h3, T h4, T... tail>
-struct integer_sequence_at<4u,
-                           integer_sequence<T, h0, h1, h2, h3, h4, tail...>> {
-  using type               = T;
-  static constexpr T value = h4;
-};
-
-template <typename T, T h0, T h1, T h2, T h3, T h4, T h5, T... tail>
-struct integer_sequence_at<
-    5u, integer_sequence<T, h0, h1, h2, h3, h4, h5, tail...>> {
-  using type               = T;
-  static constexpr T value = h5;
-};
-
-template <typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6, T... tail>
-struct integer_sequence_at<
-    6u, integer_sequence<T, h0, h1, h2, h3, h4, h5, h6, tail...>> {
-  using type               = T;
-  static constexpr T value = h6;
-};
-
-template <typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6, T h7, T... tail>
-struct integer_sequence_at<
-    7u, integer_sequence<T, h0, h1, h2, h3, h4, h5, h6, h7, tail...>> {
-  using type               = T;
-  static constexpr T value = h7;
-};
-
-//----------------------------------------
-
-template <typename T>
-constexpr T at(const unsigned, integer_sequence<T>) noexcept {
-  return ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T... tail>
-constexpr T at(const unsigned i, integer_sequence<T, h0>) noexcept {
-  return i == 0u ? h0 : ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T h1>
-constexpr T at(const unsigned i, integer_sequence<T, h0, h1>) noexcept {
-  return i == 0u ? h0 : i == 1u ? h1 : ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T h1, T h2>
-constexpr T at(const unsigned i, integer_sequence<T, h0, h1, h2>) noexcept {
-  return i == 0u ? h0 : i == 1u ? h1 : i == 2u ? h2 : ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T h1, T h2, T h3>
-constexpr T at(const unsigned i, integer_sequence<T, h0, h1, h2, h3>) noexcept {
-  return i == 0u
-             ? h0
-             : i == 1u ? h1 : i == 2u ? h2 : i == 3u ? h3 : ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T h1, T h2, T h3, T h4>
-constexpr T at(const unsigned i,
-               integer_sequence<T, h0, h1, h2, h3, h4>) noexcept {
-  return i == 0u
-             ? h0
-             : i == 1u
-                   ? h1
-                   : i == 2u ? h2
-                             : i == 3u ? h3 : i == 4u ? h4 : ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T h1, T h2, T h3, T h4, T h5>
-constexpr T at(const unsigned i,
-               integer_sequence<T, h0, h1, h2, h3, h4, h5>) noexcept {
-  return i == 0u
-             ? h0
-             : i == 1u
-                   ? h1
-                   : i == 2u ? h2
-                             : i == 3u ? h3
-                                       : i == 4u ? h4
-                                                 : i == 5u ? h5
-                                                           : ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6>
-constexpr T at(const unsigned i,
-               integer_sequence<T, h0, h1, h2, h3, h4, h5, h6>) noexcept {
-  return i == 0u
-             ? h0
-             : i == 1u
-                   ? h1
-                   : i == 2u
-                         ? h2
-                         : i == 3u
-                               ? h3
-                               : i == 4u
-                                     ? h4
-                                     : i == 5u
-                                           ? h5
-                                           : i == 6u ? h6 : ~static_cast<T>(0);
-}
-
-template <typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6, T h7, T... tail>
-constexpr T at(
-    const unsigned i,
-    integer_sequence<T, h0, h1, h2, h3, h4, h5, h6, h7, tail...>) noexcept {
-  return i == 0u
-             ? h0
-             : i == 1u
-                   ? h1
-                   : i == 2u
-                         ? h2
-                         : i == 3u
-                               ? h3
-                               : i == 4u
-                                     ? h4
-                                     : i == 5u
-                                           ? h5
-                                           : i == 6u
-                                                 ? h6
-                                                 : i == 7u
-                                                       ? h7
-                                                       : at(i - 8u,
-                                                            integer_sequence<
-                                                                T, tail...>{});
-}
-
-//----------------------------------------
-
-template <typename IntegerSequence,
-          typename ResultSequence =
-              integer_sequence<typename IntegerSequence::value_type>>
-struct reverse_integer_sequence_helper;
-
-template <typename T, T h0, T... tail, T... results>
-struct reverse_integer_sequence_helper<integer_sequence<T, h0, tail...>,
-                                       integer_sequence<T, results...>>
-    : public reverse_integer_sequence_helper<
-          integer_sequence<T, tail...>, integer_sequence<T, h0, results...>> {};
-
-template <typename T, T... results>
-struct reverse_integer_sequence_helper<integer_sequence<T>,
-                                       integer_sequence<T, results...>> {
-  using type = integer_sequence<T, results...>;
-};
-
-template <typename IntegerSequence>
-using reverse_integer_sequence =
-    typename reverse_integer_sequence_helper<IntegerSequence>::type;
-
-//----------------------------------------
-
-template <typename IntegerSequence, typename Result,
-          typename ResultSequence =
-              integer_sequence<typename IntegerSequence::value_type>>
-struct exclusive_scan_integer_sequence_helper;
-
-template <typename T, T h0, T... tail, typename Result, T... results>
-struct exclusive_scan_integer_sequence_helper<
-    integer_sequence<T, h0, tail...>, Result, integer_sequence<T, results...>>
-    : public exclusive_scan_integer_sequence_helper<
-          integer_sequence<T, tail...>,
-          std::integral_constant<T, Result::value + h0>,
-          integer_sequence<T, 0, (results + h0)...>> {};
-
-template <typename T, typename Result, T... results>
-struct exclusive_scan_integer_sequence_helper<integer_sequence<T>, Result,
-                                              integer_sequence<T, results...>> {
-  using type               = integer_sequence<T, results...>;
-  static constexpr T value = Result::value;
-};
-
-template <typename IntegerSequence>
-struct exclusive_scan_integer_sequence {
-  using value_type = typename IntegerSequence::value_type;
-  using helper     = exclusive_scan_integer_sequence_helper<
-      reverse_integer_sequence<IntegerSequence>,
-      std::integral_constant<value_type, 0>>;
-  using type                        = typename helper::type;
-  static constexpr value_type value = helper::value;
-};
-
-//----------------------------------------
-
-template <typename IntegerSequence, typename Result,
-          typename ResultSequence =
-              integer_sequence<typename IntegerSequence::value_type>>
-struct inclusive_scan_integer_sequence_helper;
-
-template <typename T, T h0, T... tail, typename Result, T... results>
-struct inclusive_scan_integer_sequence_helper<
-    integer_sequence<T, h0, tail...>, Result, integer_sequence<T, results...>>
-    : public inclusive_scan_integer_sequence_helper<
-          integer_sequence<T, tail...>,
-          std::integral_constant<T, Result::value + h0>,
-          integer_sequence<T, h0, (results + h0)...>> {};
-
-template <typename T, typename Result, T... results>
-struct inclusive_scan_integer_sequence_helper<integer_sequence<T>, Result,
-                                              integer_sequence<T, results...>> {
-  using type               = integer_sequence<T, results...>;
-  static constexpr T value = Result::value;
-};
-
-template <typename IntegerSequence>
-struct inclusive_scan_integer_sequence {
-  using value_type = typename IntegerSequence::value_type;
-  using helper     = inclusive_scan_integer_sequence_helper<
-      reverse_integer_sequence<IntegerSequence>,
-      std::integral_constant<value_type, 0>>;
-  using type                        = typename helper::type;
-  static constexpr value_type value = helper::value;
-};
-
 template <typename T>
 struct identity {
   using type = T;
@@ -396,6 +65,21 @@ struct identity {
 template <typename T>
 using identity_t = typename identity<T>::type;
 
+struct not_a_type {
+  not_a_type()                  = delete;
+  ~not_a_type()                 = delete;
+  not_a_type(not_a_type const&) = delete;
+  void operator=(not_a_type const&) = delete;
+};
+
+#if defined(__cpp_lib_void_t)
+// since C++17
+using std::void_t;
+#else
+template <class...>
+using void_t = void;
+#endif
+
 //==============================================================================
 // <editor-fold desc="remove_cvref_t"> {{{1
 
@@ -467,6 +151,16 @@ struct destruct_delete {
 };
 //==============================================================================
 
+//==============================================================================
+// <editor-fold desc="type_list"> {{{1
+
+// An intentionally uninstantiateable type_list for metaprogramming purposes
+template <class...>
+struct type_list;
+
+// </editor-fold> end type_list }}}1
+//==============================================================================
+
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp
index 267b35f66584c96b4115486aa455bea6e2fb7ae2..6915622352e47d25efa34ae687f3e4f190150974 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp
@@ -122,47 +122,47 @@ struct ViewOffset<
                              is_array_layout<Layout>::value &&
                              is_array_layout_tiled<Layout>::value)>::type> {
  public:
-  //  enum { outer_pattern = Layout::outer_pattern };
-  //  enum { inner_pattern = Layout::inner_pattern };
   static constexpr Kokkos::Iterate outer_pattern = Layout::outer_pattern;
   static constexpr Kokkos::Iterate inner_pattern = Layout::inner_pattern;
 
-  enum { VORank = Dimension::rank };
-
-  enum : unsigned { SHIFT_0 = Kokkos::Impl::integral_power_of_two(Layout::N0) };
-  enum : unsigned { SHIFT_1 = Kokkos::Impl::integral_power_of_two(Layout::N1) };
-  enum : unsigned { SHIFT_2 = Kokkos::Impl::integral_power_of_two(Layout::N2) };
-  enum : unsigned { SHIFT_3 = Kokkos::Impl::integral_power_of_two(Layout::N3) };
-  enum : unsigned { SHIFT_4 = Kokkos::Impl::integral_power_of_two(Layout::N4) };
-  enum : unsigned { SHIFT_5 = Kokkos::Impl::integral_power_of_two(Layout::N5) };
-  enum : unsigned { SHIFT_6 = Kokkos::Impl::integral_power_of_two(Layout::N6) };
-  enum : unsigned { SHIFT_7 = Kokkos::Impl::integral_power_of_two(Layout::N7) };
-  enum { MASK_0 = Layout::N0 - 1 };
-  enum { MASK_1 = Layout::N1 - 1 };
-  enum { MASK_2 = Layout::N2 - 1 };
-  enum { MASK_3 = Layout::N3 - 1 };
-  enum { MASK_4 = Layout::N4 - 1 };
-  enum { MASK_5 = Layout::N5 - 1 };
-  enum { MASK_6 = Layout::N6 - 1 };
-  enum { MASK_7 = Layout::N7 - 1 };
-
-  enum : unsigned { SHIFT_2T = SHIFT_0 + SHIFT_1 };
-  enum : unsigned { SHIFT_3T = SHIFT_0 + SHIFT_1 + SHIFT_2 };
-  enum : unsigned { SHIFT_4T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 };
-  enum : unsigned {
-    SHIFT_5T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4
-  };
-  enum : unsigned {
-    SHIFT_6T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5
-  };
-  enum : unsigned {
-    SHIFT_7T =
-        SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 + SHIFT_6
-  };
-  enum : unsigned {
-    SHIFT_8T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 +
-               SHIFT_6 + SHIFT_7
-  };
+  static constexpr int VORank = Dimension::rank;
+
+  static constexpr unsigned SHIFT_0 =
+      Kokkos::Impl::integral_power_of_two(Layout::N0);
+  static constexpr unsigned SHIFT_1 =
+      Kokkos::Impl::integral_power_of_two(Layout::N1);
+  static constexpr unsigned SHIFT_2 =
+      Kokkos::Impl::integral_power_of_two(Layout::N2);
+  static constexpr unsigned SHIFT_3 =
+      Kokkos::Impl::integral_power_of_two(Layout::N3);
+  static constexpr unsigned SHIFT_4 =
+      Kokkos::Impl::integral_power_of_two(Layout::N4);
+  static constexpr unsigned SHIFT_5 =
+      Kokkos::Impl::integral_power_of_two(Layout::N5);
+  static constexpr unsigned SHIFT_6 =
+      Kokkos::Impl::integral_power_of_two(Layout::N6);
+  static constexpr unsigned SHIFT_7 =
+      Kokkos::Impl::integral_power_of_two(Layout::N7);
+  static constexpr int MASK_0 = Layout::N0 - 1;
+  static constexpr int MASK_1 = Layout::N1 - 1;
+  static constexpr int MASK_2 = Layout::N2 - 1;
+  static constexpr int MASK_3 = Layout::N3 - 1;
+  static constexpr int MASK_4 = Layout::N4 - 1;
+  static constexpr int MASK_5 = Layout::N5 - 1;
+  static constexpr int MASK_6 = Layout::N6 - 1;
+  static constexpr int MASK_7 = Layout::N7 - 1;
+
+  static constexpr unsigned SHIFT_2T = SHIFT_0 + SHIFT_1;
+  static constexpr unsigned SHIFT_3T = SHIFT_0 + SHIFT_1 + SHIFT_2;
+  static constexpr unsigned SHIFT_4T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3;
+  static constexpr unsigned SHIFT_5T =
+      SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4;
+  static constexpr unsigned SHIFT_6T =
+      SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5;
+  static constexpr unsigned SHIFT_7T =
+      SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 + SHIFT_6;
+  static constexpr unsigned SHIFT_8T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 +
+                                       SHIFT_4 + SHIFT_5 + SHIFT_6 + SHIFT_7;
 
   // Is an irregular layout that does not have uniform striding for each index.
   using is_mapping_plugin = std::true_type;
@@ -659,6 +659,91 @@ struct ViewOffset<
                                : 0) {}
 };
 
+// FIXME Remove the out-of-class definitions when we require C++17
+#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE                                      \
+  typename std::enable_if<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \
+                           is_array_layout<Layout>::value &&                   \
+                           is_array_layout_tiled<Layout>::value)>::type
+template <class Dimension, class Layout>
+constexpr Kokkos::Iterate ViewOffset<
+    Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::outer_pattern;
+template <class Dimension, class Layout>
+constexpr Kokkos::Iterate ViewOffset<
+    Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::inner_pattern;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::VORank;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_0;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_1;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_2;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_3;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_4;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_5;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_6;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_7;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_0;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_1;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_2;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_3;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_4;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_5;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_6;
+template <class Dimension, class Layout>
+constexpr int
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_7;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_2T;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_3T;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_4T;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_5T;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_6T;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_7T;
+template <class Dimension, class Layout>
+constexpr unsigned
+    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_8T;
+#undef KOKKOS_ITERATE_VIEW_OFFSET_ENABLE
+
 //----------------------------------------
 
 // ViewMapping assign method needed in order to return a 'subview' tile as a
@@ -687,8 +772,8 @@ class ViewMapping<
                                         N6, N7, true>;
   using src_traits = Kokkos::ViewTraits<T**, src_layout, P...>;
 
-  enum { is_outer_left = (OuterP == Kokkos::Iterate::Left) };
-  enum { is_inner_left = (InnerP == Kokkos::Iterate::Left) };
+  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
+  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
   using array_layout =
       typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                 Kokkos::LayoutRight>::type;
@@ -739,8 +824,8 @@ class ViewMapping<typename std::enable_if<(N3 == 0 && N4 == 0 && N5 == 0 &&
                                         N6, N7, true>;
   using src_traits = Kokkos::ViewTraits<T***, src_layout, P...>;
 
-  enum { is_outer_left = (OuterP == Kokkos::Iterate::Left) };
-  enum { is_inner_left = (InnerP == Kokkos::Iterate::Left) };
+  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
+  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
   using array_layout =
       typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                 Kokkos::LayoutRight>::type;
@@ -797,8 +882,8 @@ class ViewMapping<typename std::enable_if<(N4 == 0 && N5 == 0 && N6 == 0 &&
                                         N6, N7, true>;
   using src_traits = Kokkos::ViewTraits<T****, src_layout, P...>;
 
-  enum { is_outer_left = (OuterP == Kokkos::Iterate::Left) };
-  enum { is_inner_left = (InnerP == Kokkos::Iterate::Left) };
+  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
+  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
   using array_layout =
       typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                 Kokkos::LayoutRight>::type;
@@ -860,8 +945,8 @@ class ViewMapping<
                                         N6, N7, true>;
   using src_traits = Kokkos::ViewTraits<T*****, src_layout, P...>;
 
-  enum { is_outer_left = (OuterP == Kokkos::Iterate::Left) };
-  enum { is_inner_left = (InnerP == Kokkos::Iterate::Left) };
+  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
+  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
   using array_layout =
       typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                 Kokkos::LayoutRight>::type;
@@ -928,8 +1013,8 @@ class ViewMapping<typename std::enable_if<(N6 == 0 && N7 == 0)>::type  // void
                                         N6, N7, true>;
   using src_traits = Kokkos::ViewTraits<T******, src_layout, P...>;
 
-  enum { is_outer_left = (OuterP == Kokkos::Iterate::Left) };
-  enum { is_inner_left = (InnerP == Kokkos::Iterate::Left) };
+  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
+  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
   using array_layout =
       typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                 Kokkos::LayoutRight>::type;
@@ -1002,8 +1087,8 @@ class ViewMapping<typename std::enable_if<(N7 == 0)>::type  // void
                                         N6, N7, true>;
   using src_traits = Kokkos::ViewTraits<T*******, src_layout, P...>;
 
-  enum { is_outer_left = (OuterP == Kokkos::Iterate::Left) };
-  enum { is_inner_left = (InnerP == Kokkos::Iterate::Left) };
+  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
+  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
   using array_layout =
       typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                 Kokkos::LayoutRight>::type;
@@ -1085,8 +1170,8 @@ class ViewMapping<typename std::enable_if<(N0 != 0 && N1 != 0 && N2 != 0 &&
                                         N6, N7, true>;
   using src_traits = Kokkos::ViewTraits<T********, src_layout, P...>;
 
-  enum { is_outer_left = (OuterP == Kokkos::Iterate::Left) };
-  enum { is_inner_left = (InnerP == Kokkos::Iterate::Left) };
+  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
+  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
   using array_layout =
       typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                 Kokkos::LayoutRight>::type;
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index d72b802511d8147cd4caff158e3c335b49aaaaf2..a380a306931f4150e95b6f433c8bb076b091c456 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -2684,7 +2684,7 @@ struct ViewDataHandle<
   template <class SrcHandleType>
   KOKKOS_INLINE_FUNCTION static handle_type assign(
       const SrcHandleType& arg_handle, size_t offset) {
-    return handle_type(arg_handle.ptr + offset);
+    return handle_type(arg_handle + offset);
   }
 };
 
@@ -3846,8 +3846,8 @@ template <class T, class Enable = void>
 struct has_printable_label_typedef : public std::false_type {};
 
 template <class T>
-struct has_printable_label_typedef<
-    T, typename enable_if_type<typename T::printable_label_typedef>::type>
+struct has_printable_label_typedef<T,
+                                   void_t<typename T::printable_label_typedef>>
     : public std::true_type {};
 
 template <class MapType>
diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5f5406746befc984f17f815e04bac63f0fadff4
--- /dev/null
+++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SETUP_SYCL_HPP_
+#define KOKKOS_SETUP_SYCL_HPP_
+
+#include <CL/sycl.hpp>
+
+#ifdef __SYCL_DEVICE_ONLY__
+#ifdef KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF
+namespace Kokkos {
+namespace ImplSYCL {
+template <typename... Args>
+void sink(Args&&... args) {
+  (void)(sizeof...(args));
+}
+}  // namespace ImplSYCL
+}  // namespace Kokkos
+#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) \
+  do {                                     \
+    Kokkos::ImplSYCL::sink(__VA_ARGS__);   \
+  } while (0)
+#else
+#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...)                       \
+  do {                                                                   \
+    static const __attribute__((opencl_constant)) char fmt[] = (format); \
+    sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__);              \
+  } while (0)
+#endif
+#endif
+
+#endif
diff --git a/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4467b2e03c486d07d80c3fee66e6c3b50c42256e
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_EXECUTIONSPACETRAIT_HPP
+#define KOKKOS_KOKKOS_EXECUTIONSPACETRAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Concepts.hpp>  // is_execution_space
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
+
+struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> {
+  struct base_traits {
+    static constexpr auto execution_space_is_defaulted = true;
+
+    using execution_space = Kokkos::DefaultExecutionSpace;
+  };
+  template <class T>
+  using trait_matches_specification = is_execution_space<T>;
+};
+
+// </editor-fold> end trait specification }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+template <class ExecutionSpace, class... Traits>
+struct AnalyzeExecPolicy<
+    std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>,
+    ExecutionSpace, Traits...> : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+
+  static_assert(base_t::execution_space_is_defaulted,
+                "Kokkos Error: More than one execution space given");
+
+  static constexpr bool execution_space_is_defaulted = false;
+
+  using execution_space = ExecutionSpace;
+};
+
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_EXECUTIONSPACETRAIT_HPP
diff --git a/packages/kokkos/algorithms/unit_tests/TestHPX.cpp b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
similarity index 58%
rename from packages/kokkos/algorithms/unit_tests/TestHPX.cpp
rename to packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
index 2981e97945cb45065452a8f5330b0b35a9f4c65c..eb649dc0887a2aab8c88feae8156676b70a7cdf7 100644
--- a/packages/kokkos/algorithms/unit_tests/TestHPX.cpp
+++ b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
@@ -42,46 +42,46 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_KOKKOS_GRAPHKERNELTRAIT_HPP
+#define KOKKOS_KOKKOS_GRAPHKERNELTRAIT_HPP
+
 #include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_HPX
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+#include <impl/Kokkos_GraphImpl_fwd.hpp>  // IsGraphKernelTag
+#include <traits/Kokkos_Traits_fwd.hpp>
+#include <impl/Kokkos_Utilities.hpp>
 
-#include <gtest/gtest.h>
-#include <Kokkos_Core.hpp>
+namespace Kokkos {
+namespace Impl {
 
-//----------------------------------------------------------------------------
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
-#include <iomanip>
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
 
-namespace Test {
+struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> {
+  struct base_traits {
+    using is_graph_kernel = std::false_type;
+  };
+  template <class T>
+  using trait_matches_specification = std::is_same<T, IsGraphKernelTag>;
+};
 
-#define HPX_RANDOM_XORSHIFT64(num_draws)                             \
-  TEST(hpx, Random_XorShift64) {                                     \
-    Impl::test_random<                                               \
-        Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::HPX> >( \
-        num_draws);                                                  \
-  }
+// </editor-fold> end trait specification }}}1
+//==============================================================================
 
-#define HPX_RANDOM_XORSHIFT1024(num_draws)                             \
-  TEST(hpx, Random_XorShift1024) {                                     \
-    Impl::test_random<                                                 \
-        Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::HPX> >( \
-        num_draws);                                                    \
-  }
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
 
-#define HPX_SORT_UNSIGNED(size)                                 \
-  TEST(hpx, SortUnsigned) {                                     \
-    Impl::test_sort<Kokkos::Experimental::HPX, unsigned>(size); \
-  }
+template <class... Traits>
+struct AnalyzeExecPolicy<void, Impl::IsGraphKernelTag, Traits...>
+    : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  using is_graph_kernel = std::true_type;
+};
 
-HPX_RANDOM_XORSHIFT64(10240000)
-HPX_RANDOM_XORSHIFT1024(10130144)
-HPX_SORT_UNSIGNED(171)
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+}  // end namespace Impl
+}  // end namespace Kokkos
 
-#undef HPX_RANDOM_XORSHIFT64
-#undef HPX_RANDOM_XORSHIFT1024
-#undef HPX_SORT_UNSIGNED
-}  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTHPX_PREVENT_LINK_ERROR() {}
-#endif
+#endif  // KOKKOS_KOKKOS_GRAPHKERNELTRAIT_HPP
diff --git a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e15adc17116cb66481f90acc0b9ba5a83ec1ab52
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
@@ -0,0 +1,107 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP
+#define KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Concepts.hpp>  // IndexType, is_index_type
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
+
+struct IndexTypeTrait : TraitSpecificationBase<IndexTypeTrait> {
+  struct base_traits {
+    static constexpr bool index_type_is_defaulted = true;
+    using index_type = dependent_policy_trait_default;
+  };
+  template <class T>
+  using trait_matches_specification =
+      std::integral_constant<bool, std::is_integral<T>::value ||
+                                       is_index_type<T>::value>;
+};
+
+// </editor-fold> end trait specification }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+// Index type given as IndexType template
+template <class IntegralIndexType, class... Traits>
+struct AnalyzeExecPolicy<void, Kokkos::IndexType<IntegralIndexType>, Traits...>
+    : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  static_assert(base_t::index_type_is_defaulted,
+                "Kokkos Error: More than one index type given");
+  static constexpr bool index_type_is_defaulted = false;
+  using index_type = Kokkos::IndexType<IntegralIndexType>;
+};
+
+// IndexType given as an integral type directly
+template <class IntegralIndexType, class... Traits>
+struct AnalyzeExecPolicy<
+    std::enable_if_t<std::is_integral<IntegralIndexType>::value>,
+    IntegralIndexType, Traits...> : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  static_assert(base_t::index_type_is_defaulted,
+                "Kokkos Error: More than one index type given");
+  static constexpr bool index_type_is_defaulted = false;
+  using index_type = Kokkos::IndexType<IntegralIndexType>;
+};
+
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP
diff --git a/packages/kokkos/algorithms/unit_tests/TestSerial.cpp b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
similarity index 54%
rename from packages/kokkos/algorithms/unit_tests/TestSerial.cpp
rename to packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
index 2eacdc2677184988c226bbaca13827a4b55cccff..30e07039a405d61f2c78217284f9036a0a533f06 100644
--- a/packages/kokkos/algorithms/unit_tests/TestSerial.cpp
+++ b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
@@ -42,47 +42,47 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_SERIAL
+#ifndef KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
+#define KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
 
-#include <gtest/gtest.h>
+#include <Kokkos_Concepts.hpp>  // is_iteration_pattern
+#include <type_traits>          // is_void
 
-#include <Kokkos_Core.hpp>
+namespace Kokkos {
+namespace Impl {
 
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
-#include <iomanip>
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
 
-//----------------------------------------------------------------------------
+struct IterationPatternTrait : TraitSpecificationBase<IterationPatternTrait> {
+  struct base_traits {
+    using iteration_pattern = void;  // TODO set default iteration pattern
+  };
+  template <class T>
+  using trait_matches_specification = is_iteration_pattern<T>;
+};
 
-namespace Test {
+// </editor-fold> end trait specification }}}1
+//==============================================================================
 
-#define SERIAL_RANDOM_XORSHIFT64(num_draws)                             \
-  TEST(serial, Random_XorShift64) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Serial> >( \
-        num_draws);                                                     \
-  }
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
 
-#define SERIAL_RANDOM_XORSHIFT1024(num_draws)                             \
-  TEST(serial, Random_XorShift1024) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Serial> >( \
-        num_draws);                                                       \
-  }
+template <class IterationPattern, class... Traits>
+struct AnalyzeExecPolicy<
+    std::enable_if_t<is_iteration_pattern<IterationPattern>::value>,
+    IterationPattern, Traits...> : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  static_assert(std::is_void<typename base_t::iteration_pattern>::value,
+                "Kokkos Error: More than one iteration pattern given");
+  using iteration_pattern = IterationPattern;
+};
 
-#define SERIAL_SORT_UNSIGNED(size)                   \
-  TEST(serial, SortUnsigned) {                       \
-    Impl::test_sort<Kokkos::Serial, unsigned>(size); \
-  }
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
 
-SERIAL_RANDOM_XORSHIFT64(10240000)
-SERIAL_RANDOM_XORSHIFT1024(10130144)
-SERIAL_SORT_UNSIGNED(171)
+}  // end namespace Impl
+}  // end namespace Kokkos
 
-#undef SERIAL_RANDOM_XORSHIFT64
-#undef SERIAL_RANDOM_XORSHIFT1024
-#undef SERIAL_SORT_UNSIGNED
-
-}  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTSERIAL_PREVENT_LINK_ERROR() {}
-#endif  // KOKKOS_ENABLE_SERIAL
+#endif  // KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
diff --git a/packages/kokkos/algorithms/unit_tests/TestCuda.cpp b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
similarity index 54%
rename from packages/kokkos/algorithms/unit_tests/TestCuda.cpp
rename to packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
index 86cee61f64f31800515e25a6052e8ac599ee423e..73ae8e27e2eca54412b4cbab464b1760c93d7aed 100644
--- a/packages/kokkos/algorithms/unit_tests/TestCuda.cpp
+++ b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
@@ -42,51 +42,50 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_CUDA
-
-#include <cstdint>
-#include <iostream>
-#include <iomanip>
+#ifndef KOKKOS_KOKKOS_LAUNCHBOUNDSTRAIT_HPP
+#define KOKKOS_KOKKOS_LAUNCHBOUNDSTRAIT_HPP
 
-#include <gtest/gtest.h>
-
-#include <Kokkos_Core.hpp>
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Concepts.hpp>  // LaunchBounds
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+#include <traits/Kokkos_Traits_fwd.hpp>
 
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
+namespace Kokkos {
+namespace Impl {
 
-namespace Test {
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
 
-void cuda_test_random_xorshift64(int num_draws) {
-  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Cuda>>(num_draws);
-  Impl::test_random<Kokkos::Random_XorShift64_Pool<
-      Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>>>(num_draws);
-}
+struct LaunchBoundsTrait : TraitSpecificationBase<LaunchBoundsTrait> {
+  struct base_traits {
+    static constexpr bool launch_bounds_is_defaulted = true;
 
-void cuda_test_random_xorshift1024(int num_draws) {
-  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Cuda>>(num_draws);
-  Impl::test_random<Kokkos::Random_XorShift1024_Pool<
-      Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>>>(num_draws);
-}
+    using launch_bounds = LaunchBounds<>;
+  };
+  template <class T>
+  using trait_matches_specification = is_launch_bounds<T>;
+};
 
-#define CUDA_RANDOM_XORSHIFT64(num_draws) \
-  TEST(cuda, Random_XorShift64) { cuda_test_random_xorshift64(num_draws); }
+// </editor-fold> end trait specification }}}1
+//==============================================================================
 
-#define CUDA_RANDOM_XORSHIFT1024(num_draws) \
-  TEST(cuda, Random_XorShift1024) { cuda_test_random_xorshift1024(num_draws); }
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
 
-#define CUDA_SORT_UNSIGNED(size) \
-  TEST(cuda, SortUnsigned) { Impl::test_sort<Kokkos::Cuda, unsigned>(size); }
+template <unsigned int MaxT, unsigned int MinB, class... Traits>
+struct AnalyzeExecPolicy<void, Kokkos::LaunchBounds<MaxT, MinB>, Traits...>
+    : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  static_assert(base_t::launch_bounds_is_defaulted,
+                "Kokkos Error: More than one launch_bounds given");
+  static constexpr bool launch_bounds_is_defaulted = false;
+  using launch_bounds = Kokkos::LaunchBounds<MaxT, MinB>;
+};
 
-CUDA_RANDOM_XORSHIFT64(132141141)
-CUDA_RANDOM_XORSHIFT1024(52428813)
-CUDA_SORT_UNSIGNED(171)
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+}  // end namespace Impl
+}  // end namespace Kokkos
 
-#undef CUDA_RANDOM_XORSHIFT64
-#undef CUDA_RANDOM_XORSHIFT1024
-#undef CUDA_SORT_UNSIGNED
-}  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTCUDA_PREVENT_LINK_ERROR() {}
-#endif /* #ifdef KOKKOS_ENABLE_CUDA */
+#endif  // KOKKOS_KOKKOS_LAUNCHBOUNDSTRAIT_HPP
diff --git a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3deb4a94d54ddeee0a6a0712f107d61674818668
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
@@ -0,0 +1,208 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_OCCUPANCYCONTROLTRAIT_HPP
+#define KOKKOS_KOKKOS_OCCUPANCYCONTROLTRAIT_HPP
+
+#include <impl/Kokkos_Error.hpp>  // KOKKOS_EXPECTS macro
+
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+namespace Kokkos {
+
+namespace Experimental {
+
+//==============================================================================
+// <editor-fold desc="Occupancy control user interface"> {{{1
+
+struct MaximizeOccupancy;
+
+struct DesiredOccupancy {
+  int m_occ = 100;
+  explicit constexpr DesiredOccupancy(int occ) : m_occ(occ) {
+    KOKKOS_EXPECTS(0 <= occ && occ <= 100);
+  }
+  explicit constexpr operator int() const { return m_occ; }
+  constexpr int value() const { return m_occ; }
+  DesiredOccupancy() = default;
+  explicit DesiredOccupancy(MaximizeOccupancy const&) : DesiredOccupancy() {}
+};
+
+struct MaximizeOccupancy {
+  explicit MaximizeOccupancy() = default;
+};
+
+// </editor-fold> end Occupancy control user interface }}}1
+//==============================================================================
+
+}  // end namespace Experimental
+
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="Occupancy control trait specification"> {{{1
+
+struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> {
+  struct base_traits {
+    using occupancy_control = Kokkos::Experimental::MaximizeOccupancy;
+    static constexpr bool experimental_contains_desired_occupancy = false;
+    // Default access occupancy_control, for when it is the (stateless) default
+    static constexpr occupancy_control impl_get_occupancy_control() {
+      return occupancy_control{};
+    }
+  };
+  template <class T>
+  using trait_matches_specification = std::integral_constant<
+      bool,
+      std::is_same<T, Kokkos::Experimental::DesiredOccupancy>::value ||
+          std::is_same<T, Kokkos::Experimental::MaximizeOccupancy>::value>;
+};
+
+// </editor-fold> end Occupancy control trait specification }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+// The DesiredOccupancy case has runtime storage, so we need to handle copies
+// and assignments
+template <class... Traits>
+struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy,
+                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
+ public:
+  using base_t            = AnalyzeExecPolicy<void, Traits...>;
+  using occupancy_control = Kokkos::Experimental::DesiredOccupancy;
+  static constexpr bool experimental_contains_desired_occupancy = true;
+
+  template <class OccControl>
+  using with_occupancy_control = AnalyzeExecPolicy<void, OccControl, Traits...>;
+
+  // Treat this as private, but make it public so that MSVC will still treat
+  // this as a standard layout class and make it the right size: storage for a
+  // stateful desired occupancy
+  //   private:
+  occupancy_control m_desired_occupancy;
+
+  AnalyzeExecPolicy() = default;
+  // Converting constructor
+  // Just rely on the convertibility of occupancy_control to transfer the data
+  template <class Other>
+  AnalyzeExecPolicy(ExecPolicyTraitsWithDefaults<Other> const& other)
+      : base_t(other),
+        m_desired_occupancy(other.impl_get_occupancy_control()) {}
+
+  // Converting assignment operator
+  // Just rely on the convertibility of occupancy_control to transfer the data
+  template <class Other>
+  AnalyzeExecPolicy& operator=(
+      ExecPolicyTraitsWithDefaults<Other> const& other) {
+    *static_cast<base_t*>(this) = other;
+    this->impl_set_desired_occupancy(
+        occupancy_control{other.impl_get_occupancy_control()});
+    return *this;
+  }
+
+  // Access to occupancy control instance, usable in generic context
+  constexpr occupancy_control impl_get_occupancy_control() const {
+    return m_desired_occupancy;
+  }
+
+  // Access to desired occupancy (getter and setter)
+  Kokkos::Experimental::DesiredOccupancy impl_get_desired_occupancy() const {
+    return m_desired_occupancy;
+  }
+
+  void impl_set_desired_occupancy(occupancy_control desired_occupancy) {
+    m_desired_occupancy = desired_occupancy;
+  }
+};
+
+template <class... Traits>
+struct AnalyzeExecPolicy<void, Kokkos::Experimental::MaximizeOccupancy,
+                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  using occupancy_control = Kokkos::Experimental::MaximizeOccupancy;
+  static constexpr bool experimental_contains_desired_occupancy = false;
+};
+
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+}  // end namespace Impl
+
+namespace Experimental {
+
+//==============================================================================
+// <editor-fold desc="User interface"> {{{1
+
+template <typename Policy>
+auto prefer(Policy const& p, DesiredOccupancy occ) {
+  using new_policy_t =
+      Kokkos::Impl::OccupancyControlTrait::policy_with_trait<Policy,
+                                                             DesiredOccupancy>;
+  new_policy_t pwo{p};
+  pwo.impl_set_desired_occupancy(occ);
+  return pwo;
+}
+
+template <typename Policy>
+constexpr auto prefer(Policy const& p, MaximizeOccupancy) {
+  static_assert(Kokkos::is_execution_policy<Policy>::value, "");
+  using new_policy_t =
+      Kokkos::Impl::OccupancyControlTrait::policy_with_trait<Policy,
+                                                             MaximizeOccupancy>;
+  return new_policy_t{p};
+}
+
+// </editor-fold> end User interface }}}1
+//==============================================================================
+
+}  // end namespace Experimental
+
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_OCCUPANCYCONTROLTRAIT_HPP
diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b087dac85559bd6dc67c983bdaad1a6675cfde9b
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_Utilities.hpp>  // type_list
+
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+#ifndef KOKKOS_KOKKOS_POLICYTRAITADAPTOR_HPP
+#define KOKKOS_KOKKOS_POLICYTRAITADAPTOR_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="Adapter for replacing/adding a trait"> {{{1
+
+//------------------------------------------------------------------------------
+
+// General strategy: given a TraitSpecification, go through the entries in the
+// parameter pack of the policy template and find the first one that returns
+// `true` for the nested `trait_matches_specification` variable template. If
+// that nested variable template is not found these overloads should be safely
+// ignored, and the trait can specialize PolicyTraitAdapterImpl to get the
+// desired behavior.
+
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="PolicyTraitMatcher"> {{{2
+
+// To handle the WorkTag case, we need more than just a predicate; we need
+// something that we can default to in the unspecialized case, just like we
+// do for AnalyzeExecPolicy
+template <class TraitSpec, class Trait, class Enable = void>
+struct PolicyTraitMatcher;
+
+template <class TraitSpec, class Trait>
+struct PolicyTraitMatcher<
+    TraitSpec, Trait,
+    std::enable_if_t<
+        TraitSpec::template trait_matches_specification<Trait>::value>>
+    : std::true_type {};
+
+// </editor-fold> end PolicyTraitMatcher }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="PolicyTraitAdaptorImpl specializations"> {{{2
+
+// Matching version, replace the trait
+template <class TraitSpec, template <class...> class PolicyTemplate,
+          class... ProcessedTraits, class MatchingTrait,
+          class... ToProcessTraits, class NewTrait>
+struct PolicyTraitAdaptorImpl<
+    TraitSpec, PolicyTemplate, type_list<ProcessedTraits...>,
+    type_list<MatchingTrait, ToProcessTraits...>, NewTrait,
+    std::enable_if_t<PolicyTraitMatcher<TraitSpec, MatchingTrait>::value>> {
+  static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value, "");
+  using type = PolicyTemplate<ProcessedTraits..., NewTrait, ToProcessTraits...>;
+};
+
+// Non-matching version, check the next option
+template <class TraitSpec, template <class...> class PolicyTemplate,
+          class... ProcessedTraits, class NonMatchingTrait,
+          class... ToProcessTraits, class NewTrait>
+struct PolicyTraitAdaptorImpl<
+    TraitSpec, PolicyTemplate, type_list<ProcessedTraits...>,
+    type_list<NonMatchingTrait, ToProcessTraits...>, NewTrait,
+    std::enable_if_t<!PolicyTraitMatcher<TraitSpec, NonMatchingTrait>::value>> {
+  using type = typename PolicyTraitAdaptorImpl<
+      TraitSpec, PolicyTemplate,
+      type_list<ProcessedTraits..., NonMatchingTrait>,
+      type_list<ToProcessTraits...>, NewTrait>::type;
+};
+
+// Base case: no matches found; just add the trait to the end of the list
+template <class TraitSpec, template <class...> class PolicyTemplate,
+          class... ProcessedTraits, class NewTrait>
+struct PolicyTraitAdaptorImpl<TraitSpec, PolicyTemplate,
+                              type_list<ProcessedTraits...>, type_list<>,
+                              NewTrait> {
+  static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value, "");
+  using type = PolicyTemplate<ProcessedTraits..., NewTrait>;
+};
+
+// </editor-fold> end PolicyTraitAdaptorImpl specializations }}}2
+//------------------------------------------------------------------------------
+
+template <class TraitSpec, template <class...> class PolicyTemplate,
+          class... Traits, class NewTrait>
+struct PolicyTraitAdaptor<TraitSpec, PolicyTemplate<Traits...>, NewTrait>
+    : PolicyTraitAdaptorImpl<TraitSpec, PolicyTemplate, type_list<>,
+                             type_list<Traits...>, NewTrait> {};
+
+// </editor-fold> end Adapter for replacing/adding a trait }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="CRTP Base class for trait specifications"> {{{1
+
+template <class TraitSpec>
+struct TraitSpecificationBase {
+  using trait_specification = TraitSpec;
+  template <class Policy, class Trait>
+  using policy_with_trait =
+      typename PolicyTraitAdaptor<TraitSpec, Policy, Trait>::type;
+};
+
+// </editor-fold> end CRTP Base class for trait specifications }}}1
+//==============================================================================
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_POLICYTRAITADAPTOR_HPP
diff --git a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..74bab6fce2a632269a804971af3e50348e34c8b2
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
@@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_SCHEDULETRAIT_HPP
+#define KOKKOS_KOKKOS_SCHEDULETRAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Concepts.hpp>  // is_schedule_type, Schedule
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+namespace Kokkos {
+
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
+
+struct ScheduleTrait : TraitSpecificationBase<ScheduleTrait> {
+  struct base_traits {
+    static constexpr auto schedule_type_is_defaulted = true;
+
+    using schedule_type = Schedule<Static>;
+  };
+  template <class T>
+  using trait_matches_specification = is_schedule_type<T>;
+};
+
+// </editor-fold> end trait specification }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+template <class ScheduleType, class... Traits>
+struct AnalyzeExecPolicy<void, Kokkos::Schedule<ScheduleType>, Traits...>
+    : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  static_assert(base_t::schedule_type_is_defaulted,
+                "Kokkos Error: More than one schedule type given");
+  static constexpr bool schedule_type_is_defaulted = false;
+  using schedule_type = Kokkos::Schedule<ScheduleType>;
+};
+
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+}  // end namespace Impl
+
+namespace Experimental {
+
+//==============================================================================
+// <editor-fold desc="User interface"> {{{1
+
+template <class Policy, class ScheduleType>
+constexpr auto require(Policy const& p, Kokkos::Schedule<ScheduleType>) {
+  static_assert(Kokkos::is_execution_policy<Policy>::value, "");
+  using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait<
+      Policy, Kokkos::Schedule<ScheduleType>>;
+  return new_policy_t{p};
+}
+
+// </editor-fold> end User interface }}}1
+//==============================================================================
+
+}  // end namespace Experimental
+
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_SCHEDULETRAIT_HPP
diff --git a/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8b9a0ca2d889b08116528803d0c1b096060ecad
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_TRAITS_FWD_HPP
+#define KOKKOS_KOKKOS_TRAITS_FWD_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template <class Enable, class... TraitsList>
+struct AnalyzeExecPolicy;
+
+template <class AnalysisResults>
+struct ExecPolicyTraitsWithDefaults;
+
+template <class TraitSpec, template <class...> class PolicyTemplate,
+          class AlreadyProcessedList, class ToProcessList, class NewTrait,
+          class Enable = void>
+struct PolicyTraitAdaptorImpl;
+
+template <class TraitSpec, class Policy, class NewTrait>
+struct PolicyTraitAdaptor;
+
+// A tag class for dependent defaults that must be handled by the
+// ExecPolicyTraitsWithDefaults wrapper, since their defaults depend on other
+// traits
+struct dependent_policy_trait_default;
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_TRAITS_FWD_HPP
diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2656316fb934333655d0370f4dc3d40eea7bbb86
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
@@ -0,0 +1,114 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_WORKITEMPROPERTYTRAIT_HPP
+#define KOKKOS_KOKKOS_WORKITEMPROPERTYTRAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Concepts.hpp>  // WorkItemProperty
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
+
+struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> {
+  struct base_traits {
+    using work_item_property = Kokkos::Experimental::WorkItemProperty::None_t;
+  };
+  template <class T>
+  using trait_matches_specification =
+      Kokkos::Experimental::is_work_item_property<T>;
+};
+
+// </editor-fold> end trait specification }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+template <class Property, class... Traits>
+struct AnalyzeExecPolicy<
+    std::enable_if_t<
+        Kokkos::Experimental::is_work_item_property<Property>::value>,
+    Property, Traits...> : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  static_assert(
+      std::is_same<typename base_t::work_item_property,
+                   Kokkos::Experimental::WorkItemProperty::None_t>::value,
+      "Kokkos Error: More than one work item property given");
+  using work_item_property = Property;
+};
+
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+}  // end namespace Impl
+
+namespace Experimental {
+
+//==============================================================================
+// <editor-fold desc="User interface"> {{{1
+
+template <class Policy, unsigned long Property>
+constexpr auto require(const Policy p,
+                       WorkItemProperty::ImplWorkItemProperty<Property>) {
+  static_assert(Kokkos::is_execution_policy<Policy>::value, "");
+  using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait<
+      Policy, WorkItemProperty::ImplWorkItemProperty<Property>>;
+  return new_policy_t{p};
+}
+
+// </editor-fold> end User interface }}}1
+//==============================================================================
+
+}  // namespace Experimental
+
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_WORKITEMPROPERTYTRAIT_HPP
diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..877005756a703b067c07c6f57c3fc4212f7484ca
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
@@ -0,0 +1,124 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_WORKTAGTRAIT_HPP
+#define KOKKOS_KOKKOS_WORKTAGTRAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Concepts.hpp>  // is_execution_space
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="trait specification"> {{{1
+
+struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> {
+  struct base_traits {
+    using work_tag = void;
+  };
+};
+
+// </editor-fold> end trait specification }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+// Since we don't have subsumption in pre-C++20, we need to have the work tag
+// "trait" handling code be unspecialized, so we handle it instead in a class
+// with a different name.
+template <class... Traits>
+struct AnalyzeExecPolicyHandleWorkTag : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+};
+
+template <class WorkTag, class... Traits>
+struct AnalyzeExecPolicyHandleWorkTag<WorkTag, Traits...>
+    : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+  static_assert(std::is_void<typename base_t::work_tag>::value,
+                "Kokkos Error: More than one work tag given");
+  using work_tag = WorkTag;
+};
+
+// This only works if this is not a partial specialization, so we have to
+// do the partial specialization elsewhere
+template <class Enable, class... Traits>
+struct AnalyzeExecPolicy : AnalyzeExecPolicyHandleWorkTag<Traits...> {
+  using base_t = AnalyzeExecPolicyHandleWorkTag<Traits...>;
+  using base_t::base_t;
+};
+
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="PolicyTraitMatcher specializations"> {{{1
+
+// In order to match the work tag trait the work tag "matcher" needs to be
+// unspecialized and the logic needs to be handled in a differently-named class,
+// just like above.
+template <class TraitSpec, class Trait>
+struct PolicyTraitMatcherHandleWorkTag : std::false_type {};
+
+template <class Trait>
+struct PolicyTraitMatcherHandleWorkTag<WorkTagTrait, Trait>
+    : std::integral_constant<bool, !std::is_void<Trait>::value> {};
+
+template <class TraitSpec, class Trait, class Enable>
+struct PolicyTraitMatcher /* unspecialized! */
+    : PolicyTraitMatcherHandleWorkTag<TraitSpec, Trait> {};
+
+// </editor-fold> end PolicyTraitMatcher specializations }}}1
+//==============================================================================
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_WORKTAGTRAIT_HPP
diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt
index 125560db3eb5c9ca28f7c48b8b6f66cb650cbfcf..5826208851090933ee296988287a6a633eb2c476 100644
--- a/packages/kokkos/core/unit_test/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/CMakeLists.txt
@@ -17,9 +17,8 @@ KOKKOS_ADD_TEST_LIBRARY(
 TARGET_COMPILE_DEFINITIONS(kokkos_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0)
 
 TARGET_INCLUDE_DIRECTORIES(kokkos_gtest PUBLIC ${GTEST_SOURCE_DIR})
-#Gtest minimally requires C++11
 IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu")))
-TARGET_COMPILE_FEATURES(kokkos_gtest PUBLIC cxx_std_11)
+  TARGET_COMPILE_FEATURES(kokkos_gtest PUBLIC cxx_std_14)
 ENDIF()
 
 # Suppress clang-tidy diagnostics on code that we do not have control over
@@ -40,11 +39,18 @@ SET(KOKKOS_HPX_FEATURE_LEVEL 999)
 SET(KOKKOS_HPX_NAME Experimental::HPX)
 SET(KOKKOS_OPENMP_FEATURE_LEVEL 999)
 SET(KOKKOS_OPENMP_NAME OpenMP)
-SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 12)
+
+# FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend.
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 8)
+ELSE()
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 13)
+ENDIF()
+
 SET(KOKKOS_OPENMPTARGET_NAME Experimental::OpenMPTarget)
 SET(KOKKOS_SERIAL_FEATURE_LEVEL 999)
 SET(KOKKOS_SERIAL_NAME Serial)
-SET(KOKKOS_SYCL_FEATURE_LEVEL 5)
+SET(KOKKOS_SYCL_FEATURE_LEVEL 999)
 SET(KOKKOS_SYCL_NAME Experimental::SYCL)
 SET(KOKKOS_THREADS_FEATURE_LEVEL 999)
 SET(KOKKOS_THREADS_NAME Threads)
@@ -57,6 +63,7 @@ SET(KOKKOS_THREADS_NAME Threads)
 #I will leave these alone for now because I don't need transitive dependencies on tests
 KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
 foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   # Because there is always an exception to the rule
@@ -73,7 +80,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
     # Needed to split this for Windows NVCC, since it ends up putting everything on the
     # command line in an intermediate compilation step even if CMake generated a response
     # file. That then exceeded the shell command line max length.
-    set(${Tag}_SOURCES1)
+    set(${Tag}_SOURCES1A)
     foreach(Name
         AtomicOperations_int
         AtomicOperations_unsignedint
@@ -94,12 +101,30 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         FunctorAnalysis
         Init
         LocalDeepCopy
+        MathematicalFunctions
         MDRange_a
         MDRange_b
         MDRange_c
+        HostSharedPtr
+        HostSharedPtrAccessOnDevice
+        )
+      set(file ${dir}/Test${Tag}_${Name}.cpp)
+      # Write to a temporary intermediate file and call configure_file to avoid
+      # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
+      file(WRITE ${dir}/dummy.cpp
+          "#include <Test${Tag}_Category.hpp>\n"
+          "#include <Test${Name}.hpp>\n"
+      )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ${Tag}_SOURCES1A ${file})
+    endforeach()
+
+    set(${Tag}_SOURCES1B)
+    foreach(Name
         MDRange_d
         MDRange_e
         MDRange_f
+        NumericTraits
         Other
         RangePolicy
         RangePolicyRequire
@@ -121,10 +146,10 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
           "#include <Test${Name}.hpp>\n"
       )
       configure_file(${dir}/dummy.cpp ${file})
-      list(APPEND ${Tag}_SOURCES1 ${file})
+      list(APPEND ${Tag}_SOURCES1B ${file})
     endforeach()
 
-    SET(${Tag}_SOURCES2)
+    SET(${Tag}_SOURCES2A)
     foreach(Name
       TeamBasic
       TeamReductionScan
@@ -144,9 +169,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
       ViewMapping_b
       ViewMapping_subview
       ViewOfClass
-      WorkGraph
-      View_64bit
       ViewResize
+      View_64bit
+      WorkGraph
       )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
@@ -156,7 +181,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
           "#include <Test${Name}.hpp>\n"
       )
       configure_file(${dir}/dummy.cpp ${file})
-      list(APPEND ${Tag}_SOURCES2 ${file})
+      list(APPEND ${Tag}_SOURCES2A ${file})
     endforeach()
 
     set(TagHostAccessible ${Tag})
@@ -164,7 +189,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
       set(TagHostAccessible CudaUVM)
     elseif(Tag STREQUAL "HIP")
       set(TagHostAccessible HIPHostPinned)
+    elseif(Tag STREQUAL "SYCL")
+      set(TagHostAccessible SYCLSharedUSMSpace)
     endif()
+
+    set(${Tag}_SOURCES2B)
     foreach(Name
       SubView_a
       SubView_b
@@ -173,26 +202,57 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
       SubView_c03
       SubView_c04
       SubView_c05
+      )
+      set(file ${dir}/Test${Tag}_${Name}.cpp)
+      # Write to a temporary intermediate file and call configure_file to avoid
+      # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
+      file(WRITE ${dir}/dummy.cpp
+          "#include <Test${TagHostAccessible}_Category.hpp>\n"
+          "#include <Test${Name}.hpp>\n"
+      )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ${Tag}_SOURCES2B ${file})
+    endforeach()
+
+    set(${Tag}_SOURCES2C)
+    foreach(Name
       SubView_c06
       SubView_c07
       SubView_c08
       SubView_c09
+      )
+      set(file ${dir}/Test${Tag}_${Name}.cpp)
+      # Write to a temporary intermediate file and call configure_file to avoid
+      # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
+      file(WRITE ${dir}/dummy.cpp
+          "#include <Test${TagHostAccessible}_Category.hpp>\n"
+          "#include <Test${Name}.hpp>\n"
+      )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ${Tag}_SOURCES2C ${file})
+    endforeach()
+
+    set(${Tag}_SOURCES2D)
+    foreach(Name
       SubView_c10
       SubView_c11
       SubView_c12
       SubView_c13
+      SubView_c14
       )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
       # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
       file(WRITE ${dir}/dummy.cpp
-	  "#include <Test${TagHostAccessible}_Category.hpp>\n"
+          "#include <Test${TagHostAccessible}_Category.hpp>\n"
           "#include <Test${Name}.hpp>\n"
       )
       configure_file(${dir}/dummy.cpp ${file})
-      list(APPEND ${Tag}_SOURCES2 ${file})
+      list(APPEND ${Tag}_SOURCES2D ${file})
     endforeach()
 
+    SET(${Tag}_SOURCES1 ${${Tag}_SOURCES1A} ${${Tag}_SOURCES1B})
+    SET(${Tag}_SOURCES2 ${${Tag}_SOURCES2A} ${${Tag}_SOURCES2B} ${${Tag}_SOURCES2C} ${${Tag}_SOURCES2D})
     SET(${Tag}_SOURCES ${${Tag}_SOURCES1} ${${Tag}_SOURCES2})
   endif()
 endforeach()
@@ -203,29 +263,81 @@ if(Kokkos_ENABLE_OPENMPTARGET)
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexdouble.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Crs.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_a.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_b.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_c.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_d.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_e.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_f.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Scan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScratch.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewCopy_a.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewCopy_b.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_subview.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewOfClass.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp
-  )
+    )
+endif()
+
+# FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
+IF(KOKKOS_ENABLE_OPENMPTARGET
+   AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+  list(REMOVE_ITEM OpenMPTarget_SOURCES
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScratch.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestScan.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamScan.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamReductionScan.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Atomics.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_float.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_int.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longlongint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_double.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedlongint.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicViews.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_BlockSizeDeduction.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_a.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_c.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Scan.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_NumericTraits.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_DeepCopyAlignment.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MathematicalFunctions.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c04.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c05.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c06.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c07.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c08.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c09.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c10.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c11.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c12.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c13.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_a.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_c.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_d.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_a.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_c.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_d.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_f.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewResize.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyRequire.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicy.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp
+    )
 endif()
 
 if(Kokkos_ENABLE_SERIAL)
@@ -422,6 +534,7 @@ if(Kokkos_ENABLE_HIP)
       hip/TestHIPHostPinned_ViewMapping_a.cpp
       hip/TestHIPHostPinned_ViewMapping_b.cpp
       hip/TestHIPHostPinned_ViewMapping_subview.cpp
+      hip/TestHIP_AsyncLauncher.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_HIPInterOpInit
@@ -438,80 +551,100 @@ if(Kokkos_ENABLE_HIP)
 endif()
 
 if(Kokkos_ENABLE_SYCL)
-  list(REMOVE_ITEM SYCL_SOURCES
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_int.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_unsignedint.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_longint.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_unsignedlongint.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_longlongint.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_double.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_float.cpp
+  list(REMOVE_ITEM SYCL_SOURCES1A
+       # FIXME_SYCL atomic_fetch_oper for large types to be implemented
        ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_complexdouble.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_complexfloat.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicViews.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Atomics.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Atomics.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicViews.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_BlockSizeDeduction.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Crs.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_DeepCopyAlignment.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_LocalDeepCopy.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_MDRange_a.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_MDRange_b.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_MDRange_c.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_MDRange_d.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_MDRange_e.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_MDRange_f.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Other.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Reductions.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Reducers_a.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Reducers_b.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Reducers_c.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Reducers_d.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Reductions_DeviceView.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SharedAlloc.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c04.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c05.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c06.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c07.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c08.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c09.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c10.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c11.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_SubView_c12.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_TeamBasic.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_TeamReductionScan.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_TeamScan.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_TeamScratch.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_TeamTeamSize.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_TeamVectorRange.cpp
-       ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_UniqueToken.cpp
+  )
+
+  list(REMOVE_ITEM SYCL_SOURCES2A
        ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_WorkGraph.cpp
   )
 
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_SYCL
+    UnitTest_SYCL1A
+    SOURCES
+      UnitTestMainInit.cpp
+      ${SYCL_SOURCES1A}
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL1B
+    SOURCES
+      UnitTestMainInit.cpp
+      ${SYCL_SOURCES1B}
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL2A
+    SOURCES
+      UnitTestMainInit.cpp
+      ${SYCL_SOURCES2A}
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL2B
+    SOURCES
+      UnitTestMainInit.cpp
+      ${SYCL_SOURCES2B}
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL2C
+    SOURCES
+      UnitTestMainInit.cpp
+      ${SYCL_SOURCES2C}
+  )
+
+ KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL2D
     SOURCES
       UnitTestMainInit.cpp
-      ${SYCL_SOURCES}
+      ${SYCL_SOURCES2D}
+  )
+ KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCLInterOpInit
+    SOURCES
+      UnitTestMain.cpp
+      sycl/TestSYCL_InterOp_Init.cpp
+  )
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCLInterOpInit_Context
+    SOURCES
+    UnitTestMainInit.cpp
+      sycl/TestSYCL_InterOp_Init_Context.cpp
+  )
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCLInterOpStreams
+    SOURCES
+      UnitTestMain.cpp
+     sycl/TestSYCL_InterOp_Streams.cpp
   )
 endif()
 
-SET(DEFAULT_DEVICE_SOURCES
-  UnitTestMainInit.cpp
-  default/TestDefaultDeviceType.cpp
-  default/TestDefaultDeviceType_a1.cpp
-  default/TestDefaultDeviceType_b1.cpp
-  default/TestDefaultDeviceType_c1.cpp
-  default/TestDefaultDeviceType_a2.cpp
-  default/TestDefaultDeviceType_b2.cpp
-  default/TestDefaultDeviceType_c2.cpp
-  default/TestDefaultDeviceType_a3.cpp
-  default/TestDefaultDeviceType_b3.cpp
-  default/TestDefaultDeviceType_c3.cpp
-  default/TestDefaultDeviceType_d.cpp
-  default/TestDefaultDeviceTypeResize.cpp
-)
+# FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
+if (KOKKOS_ENABLE_OPENMPTARGET
+    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+  SET(DEFAULT_DEVICE_SOURCES
+    UnitTestMainInit.cpp
+    default/TestDefaultDeviceType.cpp
+  )
+else()
+  SET(DEFAULT_DEVICE_SOURCES
+    UnitTestMainInit.cpp
+    default/TestDefaultDeviceType.cpp
+    default/TestDefaultDeviceType_a1.cpp
+    default/TestDefaultDeviceType_b1.cpp
+    default/TestDefaultDeviceType_c1.cpp
+    default/TestDefaultDeviceType_a2.cpp
+    default/TestDefaultDeviceType_b2.cpp
+    default/TestDefaultDeviceType_c2.cpp
+    default/TestDefaultDeviceType_a3.cpp
+    default/TestDefaultDeviceType_b3.cpp
+    default/TestDefaultDeviceType_c3.cpp
+    default/TestDefaultDeviceType_d.cpp
+    default/TestDefaultDeviceTypeResize.cpp
+  )
+endif()
 
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
   UnitTest_Default
@@ -572,6 +705,10 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       SOURCES tools/printing-tool.cpp
     )
 
+    if((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu")))
+      TARGET_COMPILE_FEATURES(kokkosprinter-tool PUBLIC cxx_std_14)
+    endif()
+
     KOKKOS_ADD_TEST_EXECUTABLE(
       ProfilingAllCalls
       tools/TestAllCalls.cpp
@@ -582,10 +719,50 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
     set(SIZE_REGEX "[0-9]*")
     set(SKIP_SCRATCH_INITIALIZATION_REGEX ".*")
 
-    KOKKOS_ADD_TEST( NAME ProfilingTestLibraryLoad
+    # check help works via environment variable
+    KOKKOS_ADD_TEST(
+      SKIP_TRIBITS
+      NAME ProfilingTestLibraryLoadHelp
+      EXE  ProfilingAllCalls
+      TOOL kokkosprinter-tool
+      ARGS --kokkos-tools-help
+      PASS_REGULAR_EXPRESSION
+        "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::")
+
+    # check help works via direct library specification
+    KOKKOS_ADD_TEST(
+      SKIP_TRIBITS
+      NAME ProfilingTestLibraryCmdLineHelp
+      EXE  ProfilingAllCalls
+      ARGS --kokkos-tools-help
+           --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool>
+      PASS_REGULAR_EXPRESSION
+        "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::")
+
+    KOKKOS_ADD_TEST(
+      SKIP_TRIBITS
+      NAME ProfilingTestLibraryLoad
       EXE  ProfilingAllCalls
       TOOL kokkosprinter-tool
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      ARGS --kokkos-tools-args="-c test delimit"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+    )
+
+    # Above will test that leading/trailing quotes are stripped bc ctest cmd args is:
+    #       "--kokkos-tools-args="-c test delimit""
+    # The bracket argument syntax: [=[ and ]=] used below ensures it is treated as
+    # a single argument:
+    #       "--kokkos-tools-args=-c test delimit"
+    #
+    # https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
+    #
+    KOKKOS_ADD_TEST(
+      SKIP_TRIBITS
+      NAME ProfilingTestLibraryCmdLine
+      EXE  ProfilingAllCalls
+      ARGS [=[--kokkos-tools-args=-c test delimit]=]
+            --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool>
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
   endif() #KOKKOS_ENABLE_LIBDL
 if(NOT KOKKOS_HAS_TRILINOS)
@@ -623,11 +800,6 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST(
 )
 endif()
 
-KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_HostBarrier
-  SOURCES UnitTestMain.cpp  TestHostBarrier.cpp
-)
-
 FUNCTION (KOKKOS_ADD_INCREMENTAL_TEST DEVICE)
   KOKKOS_OPTION( ${DEVICE}_EXCLUDE_TESTS "" STRING "Incremental test exclude list" )
   # Add unit test main
@@ -689,4 +861,6 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST(
   ARGS "one 2 THREE"
 )
 
-add_subdirectory(headers_self_contained)
+if (KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS AND NOT KOKKOS_HAS_TRILINOS)
+  add_subdirectory(headers_self_contained)
+endif()
diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile
index f039d889ee2762afa8ba23726d45ff298c856d49..390fc79a4755e46cbd61b28ee54d44814fa501d9 100644
--- a/packages/kokkos/core/unit_test/Makefile
+++ b/packages/kokkos/core/unit_test/Makefile
@@ -32,7 +32,7 @@ override LDFLAGS += -lpthread
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
-KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/unit_test
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/unit_test -I${KOKKOS_PATH}/core/unit_test/category_files
 
 TEST_TARGETS =
 TARGETS =
@@ -361,10 +361,6 @@ OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosCore_UnitTest_HWLOC
 TEST_TARGETS += test-hwloc
 
-OBJ_HOST_BARRIER = TestHostBarrier.o UnitTestMain.o gtest-all.o
-TARGETS += KokkosCore_UnitTest_HostBarrier
-TEST_TARGETS += test-host-barrier
-
 OBJ_DEFAULT = UnitTestMainInit.o gtest-all.o
 ifneq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 ifneq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
@@ -432,9 +428,6 @@ KokkosCore_UnitTest_HPXInterOp: UnitTestMain.o gtest-all.o TestHPX_InterOp.o $(K
 KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HWLOC
 
-KokkosCore_UnitTest_HostBarrier: $(OBJ_HOST_BARRIER) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(EXTRA_PATH) $(OBJ_HOST_BARRIER) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HostBarrier
-
 KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(LIB) -o KokkosCore_UnitTest_AllocationTracker
 
@@ -482,9 +475,6 @@ test-hpx: KokkosCore_UnitTest_HPX
 test-hwloc: KokkosCore_UnitTest_HWLOC
 	./KokkosCore_UnitTest_HWLOC
 
-test-host-barrier: KokkosCore_UnitTest_HostBarrier
-	./KokkosCore_UnitTest_HostBarrier
-
 test-allocationtracker: KokkosCore_UnitTest_AllocationTracker
 	./KokkosCore_UnitTest_AllocationTracker
 
diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp
index 1051ae20f6d55979077fa9380526e3db3981e2a6..e41ad5257d64ad3acb3266a0354f18d291662377 100644
--- a/packages/kokkos/core/unit_test/TestAtomics.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomics.hpp
@@ -122,7 +122,7 @@ struct SuperScalar {
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator==(const SuperScalar& src) {
+  bool operator==(const SuperScalar& src) const {
     bool compare = true;
     for (int i = 0; i < N; i++) {
       compare = compare && (val[i] == src.val[i]);
@@ -131,7 +131,7 @@ struct SuperScalar {
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator!=(const SuperScalar& src) {
+  bool operator!=(const SuperScalar& src) const {
     bool compare = true;
     for (int i = 0; i < N; i++) {
       compare = compare && (val[i] == src.val[i]);
@@ -538,6 +538,8 @@ TEST(TEST_CATEGORY, atomics) {
   ASSERT_TRUE(
       (TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 3)));
 
+// FIXME_SYCL atomics for large types to be implemented
+#ifndef KOKKOS_ENABLE_SYCL
   // FIXME_HIP HIP doesn't yet support atomics for >64bit types properly
 #ifndef KOKKOS_ENABLE_HIP
   ASSERT_TRUE(
@@ -565,6 +567,7 @@ TEST(TEST_CATEGORY, atomics) {
 #endif
 #endif
 #endif
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp
index b0307ec8cf2627e0695feeacd04f6d5a2ecf7fd8..b926058ebf990b0c7d0bff6f4c22b5bd4c12e2e8 100644
--- a/packages/kokkos/core/unit_test/TestComplex.hpp
+++ b/packages/kokkos/core/unit_test/TestComplex.hpp
@@ -414,13 +414,12 @@ TEST(TEST_CATEGORY, complex_special_funtions) {
 TEST(TEST_CATEGORY, complex_io) { testComplexIO(); }
 
 TEST(TEST_CATEGORY, complex_trivially_copyable) {
-  using RealType = double;
-
   // Kokkos::complex<RealType> is trivially copyable when RealType is
   // trivially copyable
   // Simply disable the check for IBM's XL compiler since we can't reliably
   // check for a version that defines relevant functions.
 #if !defined(__ibmxl__)
+  using RealType = double;
   // clang claims compatibility with gcc 4.2.1 but all versions tested know
   // about std::is_trivially_copyable.
   ASSERT_TRUE(std::is_trivially_copyable<Kokkos::complex<RealType>>::value ||
@@ -428,4 +427,92 @@ TEST(TEST_CATEGORY, complex_trivially_copyable) {
 #endif
 }
 
+template <class ExecSpace>
+struct TestBugPowAndLogComplex {
+  Kokkos::View<Kokkos::complex<double> *, ExecSpace> d_pow;
+  Kokkos::View<Kokkos::complex<double> *, ExecSpace> d_log;
+  TestBugPowAndLogComplex() : d_pow("pow", 2), d_log("log", 2) { test(); }
+  void test() {
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1), *this);
+    auto h_pow =
+        Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_pow);
+    ASSERT_FLOAT_EQ(h_pow(0).real(), 18);
+    ASSERT_FLOAT_EQ(h_pow(0).imag(), 26);
+    ASSERT_FLOAT_EQ(h_pow(1).real(), -18);
+    ASSERT_FLOAT_EQ(h_pow(1).imag(), 26);
+    auto h_log =
+        Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_log);
+    ASSERT_FLOAT_EQ(h_log(0).real(), 1.151292546497023);
+    ASSERT_FLOAT_EQ(h_log(0).imag(), 0.3217505543966422);
+    ASSERT_FLOAT_EQ(h_log(1).real(), 1.151292546497023);
+    ASSERT_FLOAT_EQ(h_log(1).imag(), 2.819842099193151);
+  }
+  KOKKOS_FUNCTION void operator()(int) const {
+    d_pow(0) = Kokkos::pow(Kokkos::complex<double>(+3., 1.), 3.);
+    d_pow(1) = Kokkos::pow(Kokkos::complex<double>(-3., 1.), 3.);
+    d_log(0) = Kokkos::log(Kokkos::complex<double>(+3., 1.));
+    d_log(1) = Kokkos::log(Kokkos::complex<double>(-3., 1.));
+  }
+};
+
+TEST(TEST_CATEGORY, complex_issue_3865) {
+  TestBugPowAndLogComplex<TEST_EXECSPACE>();
+}
+
+TEST(TEST_CATEGORY, complex_issue_3867) {
+  ASSERT_EQ(Kokkos::pow(Kokkos::complex<double>(2., 1.), 3.),
+            Kokkos::pow(Kokkos::complex<double>(2., 1.), 3));
+  ASSERT_EQ(
+      Kokkos::pow(Kokkos::complex<double>(2., 1.), 3.),
+      Kokkos::pow(Kokkos::complex<double>(2., 1.), Kokkos::complex<double>(3)));
+
+  auto x = Kokkos::pow(Kokkos::complex<double>(2, 1),
+                       Kokkos::complex<double>(-3, 4));
+  auto y = Kokkos::complex<double>(
+      std::pow(std::complex<double>(2, 1), std::complex<double>(-3, 4)));
+  ASSERT_FLOAT_EQ(x.real(), y.real());
+  ASSERT_FLOAT_EQ(x.imag(), y.imag());
+
+#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE)         \
+  static_assert(                                                            \
+      std::is_same<RETURNTYPE,                                              \
+                   decltype(Kokkos::pow(std::declval<ARGTYPE1>(),           \
+                                        std::declval<ARGTYPE2>()))>::value, \
+      "");                                                                  \
+  static_assert(                                                            \
+      std::is_same<RETURNTYPE,                                              \
+                   decltype(Kokkos::pow(std::declval<ARGTYPE2>(),           \
+                                        std::declval<ARGTYPE1>()))>::value, \
+      "");
+
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, long double,
+                              Kokkos::complex<long double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, double,
+                              Kokkos::complex<long double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, float,
+                              Kokkos::complex<long double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, int,
+                              Kokkos::complex<long double>);
+
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, long double,
+                              Kokkos::complex<long double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, double,
+                              Kokkos::complex<double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, float,
+                              Kokkos::complex<double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, int,
+                              Kokkos::complex<double>);
+
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, long double,
+                              Kokkos::complex<long double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, double,
+                              Kokkos::complex<double>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, float,
+                              Kokkos::complex<float>);
+  CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, int,
+                              Kokkos::complex<double>);
+
+#undef CHECK_POW_COMPLEX_PROMOTION
+}
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
index 8158f4058082c65445b6fa15dddd7f56b476fd8d..49f8daf89eabca9b3aa7e1f06d7a10ceb23a6a24 100644
--- a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
+++ b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
@@ -17,7 +17,7 @@ struct TestDeepCopy {
   static void reset_a_copy_and_b(
       Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char_copy,
       Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceB> b_char) {
-    const int N = b_char.extent(0);
+    const int N = b_char.extent_int(0);
     Kokkos::parallel_for(
         "TestDeepCopy: FillA_copy", policyA_t(0, N),
         KOKKOS_LAMBDA(const int& i) { a_char_copy(i) = char(0); });
@@ -29,7 +29,7 @@ struct TestDeepCopy {
   static int compare_equal(
       Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char_copy,
       Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char) {
-    const int N = a_char.extent(0);
+    const int N = a_char.extent_int(0);
     int errors;
     Kokkos::parallel_reduce(
         "TestDeepCopy: FillA_copy", policyA_t(0, N),
@@ -74,12 +74,12 @@ struct TestDeepCopy {
       int b_begin = 0;
       int b_end   = 0;
       auto a      = Kokkos::subview(
-          a_char, std::pair<int, int>(a_begin, a_char.extent(0) - a_end));
+          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
       auto b = Kokkos::subview(
-          b_char, std::pair<int, int>(b_begin, b_char.extent(0) - b_end));
+          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
       auto a_copy = Kokkos::subview(
           a_char_copy,
-          std::pair<int, int>(a_begin, a_char_copy.extent(0) - a_end));
+          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
       Kokkos::deep_copy(b, a);
       Kokkos::deep_copy(a_copy, b);
       int check = compare_equal(a_copy, a);
@@ -92,12 +92,12 @@ struct TestDeepCopy {
       int b_begin = 0;
       int b_end   = 5;
       auto a      = Kokkos::subview(
-          a_char, std::pair<int, int>(a_begin, a_char.extent(0) - a_end));
+          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
       auto b = Kokkos::subview(
-          b_char, std::pair<int, int>(b_begin, b_char.extent(0) - b_end));
+          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
       auto a_copy = Kokkos::subview(
           a_char_copy,
-          std::pair<int, int>(a_begin, a_char_copy.extent(0) - a_end));
+          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
       Kokkos::deep_copy(b, a);
       Kokkos::deep_copy(a_copy, b);
       int check = compare_equal(a_copy, a);
@@ -110,12 +110,12 @@ struct TestDeepCopy {
       int b_begin = 3;
       int b_end   = 0;
       auto a      = Kokkos::subview(
-          a_char, std::pair<int, int>(a_begin, a_char.extent(0) - a_end));
+          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
       auto b = Kokkos::subview(
-          b_char, std::pair<int, int>(b_begin, b_char.extent(0) - b_end));
+          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
       auto a_copy = Kokkos::subview(
           a_char_copy,
-          std::pair<int, int>(a_begin, a_char_copy.extent(0) - a_end));
+          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
       Kokkos::deep_copy(b, a);
       Kokkos::deep_copy(a_copy, b);
       int check = compare_equal(a_copy, a);
@@ -128,12 +128,12 @@ struct TestDeepCopy {
       int b_begin = 3;
       int b_end   = 6;
       auto a      = Kokkos::subview(
-          a_char, std::pair<int, int>(a_begin, a_char.extent(0) - a_end));
+          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
       auto b = Kokkos::subview(
-          b_char, std::pair<int, int>(b_begin, b_char.extent(0) - b_end));
+          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
       auto a_copy = Kokkos::subview(
           a_char_copy,
-          std::pair<int, int>(a_begin, a_char_copy.extent(0) - a_end));
+          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
       Kokkos::deep_copy(b, a);
       Kokkos::deep_copy(a_copy, b);
       int check = compare_equal(a_copy, a);
@@ -146,12 +146,12 @@ struct TestDeepCopy {
       int b_begin = 3;
       int b_end   = 6;
       auto a      = Kokkos::subview(
-          a_char, std::pair<int, int>(a_begin, a_char.extent(0) - a_end));
+          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
       auto b = Kokkos::subview(
-          b_char, std::pair<int, int>(b_begin, b_char.extent(0) - b_end));
+          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
       auto a_copy = Kokkos::subview(
           a_char_copy,
-          std::pair<int, int>(a_begin, a_char_copy.extent(0) - a_end));
+          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
       Kokkos::deep_copy(b, a);
       Kokkos::deep_copy(a_copy, b);
       int check = compare_equal(a_copy, a);
@@ -164,12 +164,12 @@ struct TestDeepCopy {
       int b_begin = 2;
       int b_end   = 6;
       auto a      = Kokkos::subview(
-          a_char, std::pair<int, int>(a_begin, a_char.extent(0) - a_end));
+          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
       auto b = Kokkos::subview(
-          b_char, std::pair<int, int>(b_begin, b_char.extent(0) - b_end));
+          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
       auto a_copy = Kokkos::subview(
           a_char_copy,
-          std::pair<int, int>(a_begin, a_char_copy.extent(0) - a_end));
+          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
       Kokkos::deep_copy(b, a);
       Kokkos::deep_copy(a_copy, b);
       int check = compare_equal(a_copy, a);
@@ -182,12 +182,12 @@ struct TestDeepCopy {
       int b_begin = 0;
       int b_end   = 8;
       auto a      = Kokkos::subview(
-          a_char, std::pair<int, int>(a_begin, a_char.extent(0) - a_end));
+          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
       auto b = Kokkos::subview(
-          b_char, std::pair<int, int>(b_begin, b_char.extent(0) - b_end));
+          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
       auto a_copy = Kokkos::subview(
           a_char_copy,
-          std::pair<int, int>(a_begin, a_char_copy.extent(0) - a_end));
+          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
       Kokkos::deep_copy(b, a);
       Kokkos::deep_copy(a_copy, b);
       int check = compare_equal(a_copy, a);
diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
index feba5acdde68aac4952a054e4496a1a8b36f599a..db52a05d5d36d5919e101f60dd7652c92771c885 100644
--- a/packages/kokkos/core/unit_test/TestHalfOperators.hpp
+++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
@@ -68,24 +68,192 @@ enum OP_TESTS {
   POSTFIX_DEC,
   CADD_H_H,
   CADD_H_S,
+  CADD_S_H,
+  CADD_H_D,
+  CADD_D_H,
   CSUB_H_H,
   CSUB_H_S,
+  CSUB_S_H,
+  CSUB_H_D,
+  CSUB_D_H,
   CMUL_H_H,
   CMUL_H_S,
+  CMUL_S_H,
+  CMUL_H_D,
+  CMUL_D_H,
   CDIV_H_H,
   CDIV_H_S,
+  CDIV_S_H,
+  CDIV_H_D,
+  CDIV_D_H,
   ADD_H_H,
   ADD_H_S,
   ADD_S_H,
+  ADD_H_D,
+  ADD_D_H,
+  ADD_H_H_SZ,
+  ADD_H_S_SZ,
+  ADD_S_H_SZ,
+  ADD_H_D_SZ,
+  ADD_D_H_SZ,
+  ADD_SI_H,
+  ADD_SI_H_SZ,
+  ADD_I_H,
+  ADD_I_H_SZ,
+  ADD_LI_H,
+  ADD_LI_H_SZ,
+  ADD_LLI_H,
+  ADD_LLI_H_SZ,
+  ADD_USI_H,
+  ADD_USI_H_SZ,
+  ADD_UI_H,
+  ADD_UI_H_SZ,
+  ADD_ULI_H,
+  ADD_ULI_H_SZ,
+  ADD_ULLI_H,
+  ADD_ULLI_H_SZ,
+  ADD_H_SI,
+  ADD_H_SI_SZ,
+  ADD_H_I,
+  ADD_H_I_SZ,
+  ADD_H_LI,
+  ADD_H_LI_SZ,
+  ADD_H_LLI,
+  ADD_H_LLI_SZ,
+  ADD_H_USI,
+  ADD_H_USI_SZ,
+  ADD_H_UI,
+  ADD_H_UI_SZ,
+  ADD_H_ULI,
+  ADD_H_ULI_SZ,
+  ADD_H_ULLI,
+  ADD_H_ULLI_SZ,
   SUB_H_H,
   SUB_H_S,
   SUB_S_H,
+  SUB_H_D,
+  SUB_D_H,
+  SUB_H_H_SZ,
+  SUB_H_S_SZ,
+  SUB_S_H_SZ,
+  SUB_H_D_SZ,
+  SUB_D_H_SZ,
+  SUB_SI_H,
+  SUB_SI_H_SZ,
+  SUB_I_H,
+  SUB_I_H_SZ,
+  SUB_LI_H,
+  SUB_LI_H_SZ,
+  SUB_LLI_H,
+  SUB_LLI_H_SZ,
+  SUB_USI_H,
+  SUB_USI_H_SZ,
+  SUB_UI_H,
+  SUB_UI_H_SZ,
+  SUB_ULI_H,
+  SUB_ULI_H_SZ,
+  SUB_ULLI_H,
+  SUB_ULLI_H_SZ,
+  SUB_H_SI,
+  SUB_H_SI_SZ,
+  SUB_H_I,
+  SUB_H_I_SZ,
+  SUB_H_LI,
+  SUB_H_LI_SZ,
+  SUB_H_LLI,
+  SUB_H_LLI_SZ,
+  SUB_H_USI,
+  SUB_H_USI_SZ,
+  SUB_H_UI,
+  SUB_H_UI_SZ,
+  SUB_H_ULI,
+  SUB_H_ULI_SZ,
+  SUB_H_ULLI,
+  SUB_H_ULLI_SZ,
   MUL_H_H,
   MUL_H_S,
   MUL_S_H,
+  MUL_H_D,
+  MUL_D_H,
+  MUL_H_H_SZ,
+  MUL_H_S_SZ,
+  MUL_S_H_SZ,
+  MUL_H_D_SZ,
+  MUL_D_H_SZ,
+  MUL_SI_H,
+  MUL_SI_H_SZ,
+  MUL_I_H,
+  MUL_I_H_SZ,
+  MUL_LI_H,
+  MUL_LI_H_SZ,
+  MUL_LLI_H,
+  MUL_LLI_H_SZ,
+  MUL_USI_H,
+  MUL_USI_H_SZ,
+  MUL_UI_H,
+  MUL_UI_H_SZ,
+  MUL_ULI_H,
+  MUL_ULI_H_SZ,
+  MUL_ULLI_H,
+  MUL_ULLI_H_SZ,
+  MUL_H_SI,
+  MUL_H_SI_SZ,
+  MUL_H_I,
+  MUL_H_I_SZ,
+  MUL_H_LI,
+  MUL_H_LI_SZ,
+  MUL_H_LLI,
+  MUL_H_LLI_SZ,
+  MUL_H_USI,
+  MUL_H_USI_SZ,
+  MUL_H_UI,
+  MUL_H_UI_SZ,
+  MUL_H_ULI,
+  MUL_H_ULI_SZ,
+  MUL_H_ULLI,
+  MUL_H_ULLI_SZ,
   DIV_H_H,
   DIV_H_S,
   DIV_S_H,
+  DIV_H_D,
+  DIV_D_H,
+  DIV_H_H_SZ,
+  DIV_H_S_SZ,
+  DIV_S_H_SZ,
+  DIV_H_D_SZ,
+  DIV_D_H_SZ,
+  DIV_SI_H,
+  DIV_SI_H_SZ,
+  DIV_I_H,
+  DIV_I_H_SZ,
+  DIV_LI_H,
+  DIV_LI_H_SZ,
+  DIV_LLI_H,
+  DIV_LLI_H_SZ,
+  DIV_USI_H,
+  DIV_USI_H_SZ,
+  DIV_UI_H,
+  DIV_UI_H_SZ,
+  DIV_ULI_H,
+  DIV_ULI_H_SZ,
+  DIV_ULLI_H,
+  DIV_ULLI_H_SZ,
+  DIV_H_SI,
+  DIV_H_SI_SZ,
+  DIV_H_I,
+  DIV_H_I_SZ,
+  DIV_H_LI,
+  DIV_H_LI_SZ,
+  DIV_H_LLI,
+  DIV_H_LLI_SZ,
+  DIV_H_USI,
+  DIV_H_USI_SZ,
+  DIV_H_UI,
+  DIV_H_UI_SZ,
+  DIV_H_ULI,
+  DIV_H_ULI_SZ,
+  DIV_H_ULLI,
+  DIV_H_ULLI_SZ,
   NEG,
   AND,
   OR,
@@ -94,8 +262,7 @@ enum OP_TESTS {
   LT,
   GT,
   LE,
-  GE,
-  TW,
+  GE,  // TODO: TW,
   PASS_BY_REF,
   AO_IMPL_HALF,
   AO_HALF_T,
@@ -124,13 +291,102 @@ struct Functor_TestHalfOperators {
     }
   }
 
+  // BEGIN: Binary Arithmetic test helpers
+  template <class LhsType, class RhsType, class ExpectedResultType>
+  KOKKOS_INLINE_FUNCTION void test_add(int op_test_idx,
+                                       int op_test_sz_idx) const {
+    auto sum = static_cast<LhsType>(h_lhs) + static_cast<RhsType>(h_rhs);
+    actual_lhs(op_test_idx) = static_cast<double>(sum);
+
+    if (std::is_same<RhsType, half_t>::value &&
+        std::is_same<LhsType, half_t>::value) {
+      expected_lhs(op_test_idx) = d_lhs + d_rhs;
+    } else {
+      if (std::is_same<LhsType, half_t>::value)
+        expected_lhs(op_test_idx) = d_lhs + static_cast<RhsType>(d_rhs);
+      if (std::is_same<RhsType, half_t>::value)
+        expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) + d_rhs;
+    }
+
+    actual_lhs(op_test_sz_idx)   = sizeof(sum);
+    expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType);
+  }
+
+  template <class LhsType, class RhsType, class ExpectedResultType>
+  KOKKOS_INLINE_FUNCTION void test_sub(int op_test_idx,
+                                       int op_test_sz_idx) const {
+    auto result = static_cast<LhsType>(h_lhs) - static_cast<RhsType>(h_rhs);
+    actual_lhs(op_test_idx) = static_cast<double>(result);
+
+    if (std::is_same<RhsType, half_t>::value &&
+        std::is_same<LhsType, half_t>::value) {
+      expected_lhs(op_test_idx) = d_lhs - d_rhs;
+    } else {
+      if (std::is_same<LhsType, half_t>::value)
+        expected_lhs(op_test_idx) = d_lhs - static_cast<RhsType>(d_rhs);
+      if (std::is_same<RhsType, half_t>::value)
+        expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) - d_rhs;
+    }
+
+    actual_lhs(op_test_sz_idx)   = sizeof(result);
+    expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType);
+  }
+
+  template <class LhsType, class RhsType, class ExpectedResultType>
+  KOKKOS_INLINE_FUNCTION void test_mul(int op_test_idx,
+                                       int op_test_sz_idx) const {
+    auto result = static_cast<LhsType>(h_lhs) * static_cast<RhsType>(h_rhs);
+    actual_lhs(op_test_idx) = static_cast<double>(result);
+
+    if (std::is_same<RhsType, half_t>::value &&
+        std::is_same<LhsType, half_t>::value) {
+      expected_lhs(op_test_idx) = d_lhs * d_rhs;
+    } else {
+      if (std::is_same<LhsType, half_t>::value)
+        expected_lhs(op_test_idx) = d_lhs * static_cast<RhsType>(d_rhs);
+      if (std::is_same<RhsType, half_t>::value)
+        expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) * d_rhs;
+    }
+
+    actual_lhs(op_test_sz_idx)   = sizeof(result);
+    expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType);
+  }
+
+  template <class LhsType, class RhsType, class ExpectedResultType>
+  KOKKOS_INLINE_FUNCTION void test_div(int op_test_idx,
+                                       int op_test_sz_idx) const {
+    auto result = static_cast<LhsType>(h_lhs) / static_cast<RhsType>(h_rhs);
+    actual_lhs(op_test_idx) = static_cast<double>(result);
+
+    if (std::is_same<RhsType, half_t>::value &&
+        std::is_same<LhsType, half_t>::value) {
+      expected_lhs(op_test_idx) = d_lhs / d_rhs;
+    } else {
+      if (std::is_same<LhsType, half_t>::value)
+        expected_lhs(op_test_idx) = d_lhs / static_cast<RhsType>(d_rhs);
+      if (std::is_same<RhsType, half_t>::value)
+        expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) / d_rhs;
+    }
+
+    actual_lhs(op_test_sz_idx)   = sizeof(result);
+    expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType);
+  }
+  // END: Binary Arithmetic test helpers
+
   KOKKOS_FUNCTION
   void operator()(int) const {
     half_t tmp_lhs, tmp2_lhs, *tmp_ptr;
     double tmp_d_lhs;
+    float tmp_s_lhs;
     using half_impl_type = Kokkos::Impl::half_impl_t::type;
     half_impl_type half_tmp;
 
+    // Initialze output views to catch missing test invocations
+    for (int i = 0; i < N_OP_TESTS; ++i) {
+      actual_lhs(i)   = 1;
+      expected_lhs(i) = -1;
+    }
+
     tmp_lhs              = h_lhs;
     actual_lhs(ASSIGN)   = cast_from_half<double>(tmp_lhs);
     expected_lhs(ASSIGN) = d_lhs;
@@ -177,11 +433,29 @@ struct Functor_TestHalfOperators {
     expected_lhs(CADD_H_H) = d_lhs;
     expected_lhs(CADD_H_H) += d_rhs;
 
-    // tmp_lhs = h_lhs;
-    // tmp_lhs += static_cast<float>(d_rhs);
-    // actual_lhs(CADD_H_S)   = cast_from_half<double>(tmp_lhs);
-    // expected_lhs(CADD_H_S) = d_lhs;
-    // expected_lhs(CADD_H_S) += d_rhs;
+    tmp_lhs = h_lhs;
+    tmp_lhs += static_cast<float>(d_rhs);
+    actual_lhs(CADD_H_S)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CADD_H_S) = d_lhs;
+    expected_lhs(CADD_H_S) += d_rhs;
+
+    tmp_s_lhs = static_cast<float>(h_lhs);
+    tmp_s_lhs += h_rhs;
+    actual_lhs(CADD_S_H)   = static_cast<double>(tmp_s_lhs);
+    expected_lhs(CADD_S_H) = d_lhs;
+    expected_lhs(CADD_S_H) += d_rhs;
+
+    tmp_lhs = static_cast<double>(h_lhs);
+    tmp_lhs += static_cast<double>(d_rhs);
+    actual_lhs(CADD_H_D)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CADD_H_D) = d_lhs;
+    expected_lhs(CADD_H_D) += d_rhs;
+
+    tmp_d_lhs = static_cast<double>(h_lhs);
+    tmp_d_lhs += h_rhs;
+    actual_lhs(CADD_D_H)   = static_cast<double>(tmp_d_lhs);
+    expected_lhs(CADD_D_H) = d_lhs;
+    expected_lhs(CADD_D_H) += d_rhs;
 
     tmp_lhs = h_lhs;
     tmp_lhs -= h_rhs;
@@ -189,11 +463,29 @@ struct Functor_TestHalfOperators {
     expected_lhs(CSUB_H_H) = d_lhs;
     expected_lhs(CSUB_H_H) -= d_rhs;
 
-    // tmp_lhs = h_lhs;
-    // tmp_lhs -= static_cast<float>(d_rhs);
-    // actual_lhs(CSUB_H_S)   = cast_from_half<double>(tmp_lhs);
-    // expected_lhs(CSUB_H_S) = d_lhs;
-    // expected_lhs(CSUB_H_S) -= d_rhs;
+    tmp_lhs = h_lhs;
+    tmp_lhs -= static_cast<float>(d_rhs);
+    actual_lhs(CSUB_H_S)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CSUB_H_S) = d_lhs;
+    expected_lhs(CSUB_H_S) -= d_rhs;
+
+    tmp_s_lhs = static_cast<float>(h_lhs);
+    tmp_s_lhs -= h_rhs;
+    actual_lhs(CSUB_S_H)   = static_cast<double>(tmp_s_lhs);
+    expected_lhs(CSUB_S_H) = d_lhs;
+    expected_lhs(CSUB_S_H) -= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs -= d_rhs;
+    actual_lhs(CSUB_H_D)   = static_cast<double>(tmp_lhs);
+    expected_lhs(CSUB_H_D) = d_lhs;
+    expected_lhs(CSUB_H_D) -= d_rhs;
+
+    tmp_d_lhs = static_cast<double>(h_lhs);
+    tmp_d_lhs -= h_rhs;
+    actual_lhs(CSUB_D_H)   = tmp_d_lhs;
+    expected_lhs(CSUB_D_H) = d_lhs;
+    expected_lhs(CSUB_D_H) -= d_rhs;
 
     tmp_lhs = h_lhs;
     tmp_lhs *= h_rhs;
@@ -201,11 +493,29 @@ struct Functor_TestHalfOperators {
     expected_lhs(CMUL_H_H) = d_lhs;
     expected_lhs(CMUL_H_H) *= d_rhs;
 
-    // tmp_lhs = h_lhs;
-    // tmp_lhs *= static_cast<float>(d_rhs);
-    // actual_lhs(CMUL_H_S)   = cast_from_half<double>(tmp_lhs);
-    // expected_lhs(CMUL_H_S) = d_lhs;
-    // expected_lhs(CMUL_H_S) *= d_rhs;
+    tmp_lhs = h_lhs;
+    tmp_lhs *= static_cast<float>(d_rhs);
+    actual_lhs(CMUL_H_S)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CMUL_H_S) = d_lhs;
+    expected_lhs(CMUL_H_S) *= d_rhs;
+
+    tmp_s_lhs = static_cast<float>(h_lhs);
+    tmp_s_lhs *= h_rhs;
+    actual_lhs(CMUL_S_H)   = static_cast<double>(tmp_s_lhs);
+    expected_lhs(CMUL_S_H) = d_lhs;
+    expected_lhs(CMUL_S_H) *= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs *= d_rhs;
+    actual_lhs(CMUL_H_D)   = static_cast<double>(tmp_lhs);
+    expected_lhs(CMUL_H_D) = d_lhs;
+    expected_lhs(CMUL_H_D) *= d_rhs;
+
+    tmp_d_lhs = static_cast<double>(h_lhs);
+    tmp_d_lhs *= h_rhs;
+    actual_lhs(CMUL_D_H)   = tmp_d_lhs;
+    expected_lhs(CMUL_D_H) = d_lhs;
+    expected_lhs(CMUL_D_H) *= d_rhs;
 
     tmp_lhs = h_lhs;
     tmp_lhs /= h_rhs;
@@ -213,47 +523,249 @@ struct Functor_TestHalfOperators {
     expected_lhs(CDIV_H_H) = d_lhs;
     expected_lhs(CDIV_H_H) /= d_rhs;
 
-    // tmp_lhs = h_lhs;
-    // tmp_lhs /= static_cast<float>(d_rhs);
-    // actual_lhs(CDIV_H_S)   = cast_from_half<double>(tmp_lhs);
-    // expected_lhs(CDIV_H_S) = d_lhs;
-    // expected_lhs(CDIV_H_S) /= d_rhs;
-
-    actual_lhs(ADD_H_H)   = cast_from_half<double>(h_lhs + h_rhs);
-    expected_lhs(ADD_H_H) = d_lhs + d_rhs;
-    // actual_lhs(ADD_H_S) =
-    //    cast_from_half<double>(h_lhs + static_cast<float>(d_rhs));
-    // expected_lhs(ADD_H_S) = d_lhs + d_rhs;
-    // actual_lhs(ADD_S_H) =
-    //    cast_from_half<double>(static_cast<float>(d_lhs) + h_rhs);
-    // expected_lhs(ADD_S_H) = d_lhs + d_rhs;
-
-    actual_lhs(SUB_H_H)   = cast_from_half<double>(h_lhs - h_rhs);
-    expected_lhs(SUB_H_H) = d_lhs - d_rhs;
-    // actual_lhs(SUB_H_S) =
-    //    cast_from_half<double>(h_lhs - static_cast<float>(d_rhs));
-    // expected_lhs(SUB_H_S) = d_lhs - d_rhs;
-    // actual_lhs(SUB_S_H) =
-    //    cast_from_half<double>(static_cast<float>(d_lhs) - h_rhs);
-    // expected_lhs(SUB_S_H) = d_lhs - d_rhs;
-
-    actual_lhs(MUL_H_H)   = cast_from_half<double>(h_lhs * h_rhs);
-    expected_lhs(MUL_H_H) = d_lhs * d_rhs;
-    // actual_lhs(MUL_H_S) =
-    //    cast_from_half<double>(h_lhs * static_cast<float>(d_rhs));
-    // expected_lhs(MUL_H_S) = d_lhs * d_rhs;
-    // actual_lhs(MUL_S_H) =
-    //    cast_from_half<double>(static_cast<float>(d_lhs) * h_rhs);
-    // expected_lhs(MUL_S_H) = d_lhs * d_rhs;
-
-    actual_lhs(DIV_H_H)   = cast_from_half<double>(h_lhs / h_rhs);
-    expected_lhs(DIV_H_H) = d_lhs / d_rhs;
-    // actual_lhs(DIV_H_S) =
-    //    cast_from_half<double>(h_lhs / static_cast<float>(d_rhs));
-    // expected_lhs(DIV_H_S) = d_lhs / d_rhs;
-    // actual_lhs(DIV_S_H) =
-    //    cast_from_half<double>(static_cast<float>(d_lhs) / h_rhs);
-    // expected_lhs(DIV_S_H) = d_lhs / d_rhs;
+    tmp_lhs = h_lhs;
+    tmp_lhs /= static_cast<float>(d_rhs);
+    actual_lhs(CDIV_H_S)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CDIV_H_S) = d_lhs;
+    expected_lhs(CDIV_H_S) /= d_rhs;
+
+    tmp_s_lhs = static_cast<float>(h_lhs);
+    tmp_s_lhs /= h_rhs;
+    actual_lhs(CDIV_S_H)   = static_cast<double>(tmp_s_lhs);
+    expected_lhs(CDIV_S_H) = d_lhs;
+    expected_lhs(CDIV_S_H) /= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs /= d_rhs;
+    actual_lhs(CDIV_H_D)   = static_cast<double>(tmp_lhs);
+    expected_lhs(CDIV_H_D) = d_lhs;
+    expected_lhs(CDIV_H_D) /= d_rhs;
+
+    tmp_d_lhs = static_cast<double>(h_lhs);
+    tmp_d_lhs /= h_rhs;
+    actual_lhs(CDIV_D_H)   = tmp_d_lhs;
+    expected_lhs(CDIV_D_H) = d_lhs;
+    expected_lhs(CDIV_D_H) /= d_rhs;
+
+    test_add<half_t, half_t, half_t>(ADD_H_H, ADD_H_H_SZ);
+    test_add<float, half_t, float>(ADD_S_H, ADD_S_H_SZ);
+    test_add<double, half_t, double>(ADD_D_H, ADD_D_H_SZ);
+    test_add<short int, half_t, half_t>(ADD_SI_H, ADD_SI_H_SZ);
+    test_add<int, half_t, half_t>(ADD_I_H, ADD_I_H_SZ);
+    test_add<long int, half_t, half_t>(ADD_LI_H, ADD_LI_H_SZ);
+    test_add<long long int, half_t, half_t>(ADD_LLI_H, ADD_LLI_H_SZ);
+    test_add<half_t, float, float>(ADD_H_S, ADD_H_S_SZ);
+    test_add<half_t, double, double>(ADD_H_D, ADD_H_D_SZ);
+    test_add<half_t, short int, half_t>(ADD_H_SI, ADD_H_SI_SZ);
+    test_add<half_t, int, half_t>(ADD_H_I, ADD_H_I_SZ);
+    test_add<half_t, long int, half_t>(ADD_H_LI, ADD_H_LI_SZ);
+    test_add<half_t, long long int, half_t>(ADD_H_LLI, ADD_H_LLI_SZ);
+
+    // Check for potential overflow due to negative half_t -> unsigned integral
+    // cast
+    if (h_lhs >= 0) {
+      test_add<unsigned short int, half_t, half_t>(ADD_USI_H, ADD_USI_H_SZ);
+      test_add<unsigned int, half_t, half_t>(ADD_UI_H, ADD_UI_H_SZ);
+      test_add<unsigned long int, half_t, half_t>(ADD_ULI_H, ADD_ULI_H_SZ);
+      test_add<unsigned long long int, half_t, half_t>(ADD_ULLI_H,
+                                                       ADD_ULLI_H_SZ);
+    } else {
+      actual_lhs(ADD_USI_H)     = expected_lhs(ADD_USI_H);
+      actual_lhs(ADD_USI_H_SZ)  = expected_lhs(ADD_USI_H_SZ);
+      actual_lhs(ADD_UI_H)      = expected_lhs(ADD_UI_H);
+      actual_lhs(ADD_UI_H_SZ)   = expected_lhs(ADD_UI_H_SZ);
+      actual_lhs(ADD_ULI_H)     = expected_lhs(ADD_ULI_H);
+      actual_lhs(ADD_ULI_H_SZ)  = expected_lhs(ADD_ULI_H_SZ);
+      actual_lhs(ADD_ULLI_H)    = expected_lhs(ADD_ULLI_H);
+      actual_lhs(ADD_ULLI_H_SZ) = expected_lhs(ADD_ULLI_H_SZ);
+    }
+
+    // Check for potential overflow due to negative half_t -> unsigned integral
+    // cast
+    if (h_rhs >= 0) {
+      test_add<half_t, unsigned short int, half_t>(ADD_H_USI, ADD_H_USI_SZ);
+      test_add<half_t, unsigned int, half_t>(ADD_H_UI, ADD_H_UI_SZ);
+      test_add<half_t, unsigned long int, half_t>(ADD_H_ULI, ADD_H_ULI_SZ);
+      test_add<half_t, unsigned long long int, half_t>(ADD_H_ULLI,
+                                                       ADD_H_ULLI_SZ);
+    } else {
+      actual_lhs(ADD_H_USI)     = expected_lhs(ADD_H_USI);
+      actual_lhs(ADD_H_USI_SZ)  = expected_lhs(ADD_H_USI_SZ);
+      actual_lhs(ADD_H_UI)      = expected_lhs(ADD_H_UI);
+      actual_lhs(ADD_H_UI_SZ)   = expected_lhs(ADD_H_UI_SZ);
+      actual_lhs(ADD_H_ULI)     = expected_lhs(ADD_H_ULI);
+      actual_lhs(ADD_H_ULI_SZ)  = expected_lhs(ADD_H_ULI_SZ);
+      actual_lhs(ADD_H_ULLI)    = expected_lhs(ADD_H_ULLI);
+      actual_lhs(ADD_H_ULLI_SZ) = expected_lhs(ADD_H_ULLI_SZ);
+    }
+
+    test_sub<half_t, half_t, half_t>(SUB_H_H, SUB_H_H_SZ);
+    test_sub<float, half_t, float>(SUB_S_H, SUB_S_H_SZ);
+    test_sub<double, half_t, double>(SUB_D_H, SUB_D_H_SZ);
+    test_sub<short int, half_t, half_t>(SUB_SI_H, SUB_SI_H_SZ);
+    test_sub<int, half_t, half_t>(SUB_I_H, SUB_I_H_SZ);
+    test_sub<long int, half_t, half_t>(SUB_LI_H, SUB_LI_H_SZ);
+    test_sub<long long int, half_t, half_t>(SUB_LLI_H, SUB_LLI_H_SZ);
+    test_sub<half_t, float, float>(SUB_H_S, SUB_H_S_SZ);
+    test_sub<half_t, double, double>(SUB_H_D, SUB_H_D_SZ);
+    test_sub<half_t, short int, half_t>(SUB_H_SI, SUB_H_SI_SZ);
+    test_sub<half_t, int, half_t>(SUB_H_I, SUB_H_I_SZ);
+    test_sub<half_t, long int, half_t>(SUB_H_LI, SUB_H_LI_SZ);
+    test_sub<half_t, long long int, half_t>(SUB_H_LLI, SUB_H_LLI_SZ);
+
+    // Check for potential overflow due to negative half_t -> unsigned integral
+    // cast
+    if (h_lhs >= half_t(0)) {
+      test_sub<unsigned short int, half_t, half_t>(SUB_USI_H, SUB_USI_H_SZ);
+      test_sub<unsigned int, half_t, half_t>(SUB_UI_H, SUB_UI_H_SZ);
+      test_sub<unsigned long int, half_t, half_t>(SUB_ULI_H, SUB_ULI_H_SZ);
+      test_sub<unsigned long long int, half_t, half_t>(SUB_ULLI_H,
+                                                       SUB_ULLI_H_SZ);
+    } else {
+      actual_lhs(SUB_USI_H)     = expected_lhs(SUB_USI_H);
+      actual_lhs(SUB_USI_H_SZ)  = expected_lhs(SUB_USI_H_SZ);
+      actual_lhs(SUB_UI_H)      = expected_lhs(SUB_UI_H);
+      actual_lhs(SUB_UI_H_SZ)   = expected_lhs(SUB_UI_H_SZ);
+      actual_lhs(SUB_ULI_H)     = expected_lhs(SUB_ULI_H);
+      actual_lhs(SUB_ULI_H_SZ)  = expected_lhs(SUB_ULI_H_SZ);
+      actual_lhs(SUB_ULLI_H)    = expected_lhs(SUB_ULLI_H);
+      actual_lhs(SUB_ULLI_H_SZ) = expected_lhs(SUB_ULLI_H_SZ);
+    }
+
+    // Check for potential overflow due to negative half_t -> unsigned integral
+    // cast
+    if (h_rhs >= half_t(0)) {
+      test_sub<half_t, unsigned short int, half_t>(SUB_H_USI, SUB_H_USI_SZ);
+      test_sub<half_t, unsigned int, half_t>(SUB_H_UI, SUB_H_UI_SZ);
+      test_sub<half_t, unsigned long int, half_t>(SUB_H_ULI, SUB_H_ULI_SZ);
+      test_sub<half_t, unsigned long long int, half_t>(SUB_H_ULLI,
+                                                       SUB_H_ULLI_SZ);
+    } else {
+      actual_lhs(SUB_H_USI)     = expected_lhs(SUB_H_USI);
+      actual_lhs(SUB_H_USI_SZ)  = expected_lhs(SUB_H_USI_SZ);
+      actual_lhs(SUB_H_UI)      = expected_lhs(SUB_H_UI);
+      actual_lhs(SUB_H_UI_SZ)   = expected_lhs(SUB_H_UI_SZ);
+      actual_lhs(SUB_H_ULI)     = expected_lhs(SUB_H_ULI);
+      actual_lhs(SUB_H_ULI_SZ)  = expected_lhs(SUB_H_ULI_SZ);
+      actual_lhs(SUB_H_ULLI)    = expected_lhs(SUB_H_ULLI);
+      actual_lhs(SUB_H_ULLI_SZ) = expected_lhs(SUB_H_ULLI_SZ);
+    }
+
+    test_mul<half_t, half_t, half_t>(MUL_H_H, MUL_H_H_SZ);
+    test_mul<float, half_t, float>(MUL_S_H, MUL_S_H_SZ);
+    test_mul<double, half_t, double>(MUL_D_H, MUL_D_H_SZ);
+    test_mul<short int, half_t, half_t>(MUL_SI_H, MUL_SI_H_SZ);
+    test_mul<int, half_t, half_t>(MUL_I_H, MUL_I_H_SZ);
+    test_mul<long int, half_t, half_t>(MUL_LI_H, MUL_LI_H_SZ);
+    test_mul<long long int, half_t, half_t>(MUL_LLI_H, MUL_LLI_H_SZ);
+    test_mul<half_t, float, float>(MUL_H_S, MUL_H_S_SZ);
+    test_mul<half_t, double, double>(MUL_H_D, MUL_H_D_SZ);
+    test_mul<half_t, short int, half_t>(MUL_H_SI, MUL_H_SI_SZ);
+    test_mul<half_t, int, half_t>(MUL_H_I, MUL_H_I_SZ);
+    test_mul<half_t, long int, half_t>(MUL_H_LI, MUL_H_LI_SZ);
+    test_mul<half_t, long long int, half_t>(MUL_H_LLI, MUL_H_LLI_SZ);
+
+    // Check for potential overflow due to negative half_t -> unsigned integral
+    // cast
+    if (h_lhs >= half_t(0)) {
+      test_mul<unsigned short int, half_t, half_t>(MUL_USI_H, MUL_USI_H_SZ);
+      test_mul<unsigned int, half_t, half_t>(MUL_UI_H, MUL_UI_H_SZ);
+      test_mul<unsigned long int, half_t, half_t>(MUL_ULI_H, MUL_ULI_H_SZ);
+      test_mul<unsigned long long int, half_t, half_t>(MUL_ULLI_H,
+                                                       MUL_ULLI_H_SZ);
+    } else {
+      actual_lhs(MUL_USI_H)     = expected_lhs(MUL_USI_H);
+      actual_lhs(MUL_UI_H)      = expected_lhs(MUL_UI_H);
+      actual_lhs(MUL_ULI_H)     = expected_lhs(MUL_ULI_H);
+      actual_lhs(MUL_ULLI_H)    = expected_lhs(MUL_ULLI_H);
+      actual_lhs(MUL_USI_H_SZ)  = expected_lhs(MUL_USI_H_SZ);
+      actual_lhs(MUL_UI_H_SZ)   = expected_lhs(MUL_UI_H_SZ);
+      actual_lhs(MUL_ULI_H_SZ)  = expected_lhs(MUL_ULI_H_SZ);
+      actual_lhs(MUL_ULLI_H_SZ) = expected_lhs(MUL_ULLI_H_SZ);
+    }
+
+    // Check for potential overflow due to negative half_t -> unsigned integral
+    // cast
+    if (h_rhs >= half_t(0)) {
+      test_mul<half_t, unsigned short int, half_t>(MUL_H_USI, MUL_H_USI_SZ);
+      test_mul<half_t, unsigned int, half_t>(MUL_H_UI, MUL_H_UI_SZ);
+      test_mul<half_t, unsigned long int, half_t>(MUL_H_ULI, MUL_H_ULI_SZ);
+      test_mul<half_t, unsigned long long int, half_t>(MUL_H_ULLI,
+                                                       MUL_H_ULLI_SZ);
+    } else {
+      actual_lhs(MUL_H_USI)     = expected_lhs(MUL_H_USI);
+      actual_lhs(MUL_H_UI)      = expected_lhs(MUL_H_UI);
+      actual_lhs(MUL_H_ULI)     = expected_lhs(MUL_H_ULI);
+      actual_lhs(MUL_H_ULLI)    = expected_lhs(MUL_H_ULLI);
+      actual_lhs(MUL_H_USI_SZ)  = expected_lhs(MUL_H_USI_SZ);
+      actual_lhs(MUL_H_UI_SZ)   = expected_lhs(MUL_H_UI_SZ);
+      actual_lhs(MUL_H_ULI_SZ)  = expected_lhs(MUL_H_ULI_SZ);
+      actual_lhs(MUL_H_ULLI_SZ) = expected_lhs(MUL_H_ULLI_SZ);
+    }
+
+    test_div<half_t, half_t, half_t>(DIV_H_H, DIV_H_H_SZ);
+    test_div<float, half_t, float>(DIV_S_H, DIV_S_H_SZ);
+    test_div<double, half_t, double>(DIV_D_H, DIV_D_H_SZ);
+    test_div<short int, half_t, half_t>(DIV_SI_H, DIV_SI_H_SZ);
+    test_div<int, half_t, half_t>(DIV_I_H, DIV_I_H_SZ);
+    test_div<long int, half_t, half_t>(DIV_LI_H, DIV_LI_H_SZ);
+    test_div<long long int, half_t, half_t>(DIV_LLI_H, DIV_LLI_H_SZ);
+    test_div<half_t, float, float>(DIV_H_S, DIV_H_S_SZ);
+    test_div<half_t, double, double>(DIV_H_D, DIV_H_D_SZ);
+
+    // Check for division by zero due to truncation by half_t -> integral cast
+    if (h_rhs >= half_t(1) || h_rhs <= half_t(-1)) {
+      test_div<half_t, short int, half_t>(DIV_H_SI, DIV_H_SI_SZ);
+      test_div<half_t, int, half_t>(DIV_H_I, DIV_H_I_SZ);
+      test_div<half_t, long int, half_t>(DIV_H_LI, DIV_H_LI_SZ);
+      test_div<half_t, long long int, half_t>(DIV_H_LLI, DIV_H_LLI_SZ);
+    } else {
+      actual_lhs(DIV_H_SI)     = expected_lhs(DIV_H_SI);
+      actual_lhs(DIV_H_I)      = expected_lhs(DIV_H_I);
+      actual_lhs(DIV_H_LI)     = expected_lhs(DIV_H_LI);
+      actual_lhs(DIV_H_LLI)    = expected_lhs(DIV_H_LLI);
+      actual_lhs(DIV_H_SI_SZ)  = expected_lhs(DIV_H_SI_SZ);
+      actual_lhs(DIV_H_I_SZ)   = expected_lhs(DIV_H_I_SZ);
+      actual_lhs(DIV_H_LI_SZ)  = expected_lhs(DIV_H_LI_SZ);
+      actual_lhs(DIV_H_LLI_SZ) = expected_lhs(DIV_H_LLI_SZ);
+    }
+
+    // Check for potential overflow due to negative half_t -> unsigned integral
+    // cast
+    if (h_lhs >= half_t(0)) {
+      test_div<unsigned short int, half_t, half_t>(DIV_USI_H, DIV_USI_H_SZ);
+      test_div<unsigned int, half_t, half_t>(DIV_UI_H, DIV_UI_H_SZ);
+      test_div<unsigned long int, half_t, half_t>(DIV_ULI_H, DIV_ULI_H_SZ);
+      test_div<unsigned long long int, half_t, half_t>(DIV_ULLI_H,
+                                                       DIV_ULLI_H_SZ);
+    } else {
+      actual_lhs(DIV_USI_H)     = expected_lhs(DIV_USI_H);
+      actual_lhs(DIV_UI_H)      = expected_lhs(DIV_UI_H);
+      actual_lhs(DIV_ULI_H)     = expected_lhs(DIV_ULI_H);
+      actual_lhs(DIV_ULLI_H)    = expected_lhs(DIV_ULLI_H);
+      actual_lhs(DIV_USI_H_SZ)  = expected_lhs(DIV_USI_H_SZ);
+      actual_lhs(DIV_UI_H_SZ)   = expected_lhs(DIV_UI_H_SZ);
+      actual_lhs(DIV_ULI_H_SZ)  = expected_lhs(DIV_ULI_H_SZ);
+      actual_lhs(DIV_ULLI_H_SZ) = expected_lhs(DIV_ULLI_H_SZ);
+    }
+
+    // Check for division by zero due to truncation by half_t -> integral cast
+    if (h_rhs >= half_t(1)) {
+      test_div<half_t, unsigned short int, half_t>(DIV_H_USI, DIV_H_USI_SZ);
+      test_div<half_t, unsigned int, half_t>(DIV_H_UI, DIV_H_UI_SZ);
+      test_div<half_t, unsigned long int, half_t>(DIV_H_ULI, DIV_H_ULI_SZ);
+      test_div<half_t, unsigned long long int, half_t>(DIV_H_ULLI,
+                                                       DIV_H_ULLI_SZ);
+    } else {
+      actual_lhs(DIV_H_USI)     = expected_lhs(DIV_H_USI);
+      actual_lhs(DIV_H_USI_SZ)  = expected_lhs(DIV_H_USI_SZ);
+      actual_lhs(DIV_H_UI)      = expected_lhs(DIV_H_UI);
+      actual_lhs(DIV_H_UI_SZ)   = expected_lhs(DIV_H_UI_SZ);
+      actual_lhs(DIV_H_ULI)     = expected_lhs(DIV_H_ULI);
+      actual_lhs(DIV_H_ULI_SZ)  = expected_lhs(DIV_H_ULI_SZ);
+      actual_lhs(DIV_H_ULLI)    = expected_lhs(DIV_H_ULLI);
+      actual_lhs(DIV_H_ULLI_SZ) = expected_lhs(DIV_H_ULLI_SZ);
+    }
 
     // TODO: figure out why operator{!,&&,||} are returning __nv_bool
     actual_lhs(NEG)   = static_cast<double>(!h_lhs);
@@ -303,7 +815,8 @@ struct Functor_TestHalfOperators {
     actual_lhs(AO_HALF_T)   = cast_from_half<double>(tmp_ptr[0]);
     expected_lhs(AO_HALF_T) = d_lhs;
 
-    // TODO: Add upcast / downcast tests using sizeof
+    // TODO: Check upcasting and downcasting in large expressions involving
+    // integral and floating point types
   }
 };
 
@@ -320,7 +833,7 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
   Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs);
   Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs);
   for (int op_test = 0; op_test < N_OP_TESTS; op_test++) {
-    // printf("%lf\n", actual_lhs(op));
+    // printf("op_test = %d\n", op_test);
     ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test),
                 epsilon);
     ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test),
@@ -351,8 +864,13 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
 void test_half_operators() {
   half_t h_lhs = half_t(0.23458), h_rhs = half_t(0.67898);
   for (int i = -3; i < 2; i++) {
+    // printf("%f OP %f\n", float(h_lhs + cast_to_half(i + 1)), float(h_rhs +
+    // cast_to_half(i)));
     __test_half_operators(h_lhs + cast_to_half(i + 1), h_rhs + cast_to_half(i));
+    // TODO: __test_half_operators(h_lhs + cast_to_half(i + 1), half_t(0));
+    // TODO: __test_half_operators(half_t(0), h_rhs + cast_to_half(i));
   }
+  // TODO: __test_half_operators(0, 0);
 }
 
 TEST(TEST_CATEGORY, half_operators) { test_half_operators(); }
diff --git a/packages/kokkos/core/unit_test/TestHostBarrier.cpp b/packages/kokkos/core/unit_test/TestHostBarrier.cpp
deleted file mode 100644
index 230ba2fb83a278d2fef085ae7540789e0e1a74d2..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/TestHostBarrier.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <gtest/gtest.h>
-
-namespace Test {
-
-TEST(host_barrier, openmp) {}
-
-}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..731e9fc36d9bf17aa93fc1e458d3058bf7a37994
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp
@@ -0,0 +1,155 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_HostSharedPtr.hpp>
+
+#include <gtest/gtest.h>
+
+using Kokkos::Impl::HostSharedPtr;
+
+TEST(TEST_CATEGORY, host_shared_ptr_use_count) {
+  using T = int;
+  {
+    HostSharedPtr<T> p1;
+    EXPECT_EQ(p1.use_count(), 0);
+  }
+  {
+    HostSharedPtr<T> p1(nullptr);
+    EXPECT_EQ(p1.use_count(), 0);
+  }
+  {
+    HostSharedPtr<T> p1(new T());
+    EXPECT_EQ(p1.use_count(), 1);
+  }
+  {
+    HostSharedPtr<T> p1(new T(), [](T* p) { delete p; });
+    EXPECT_EQ(p1.use_count(), 1);
+  }
+  {
+    T i;
+    HostSharedPtr<T> p1(&i, [](T*) {});
+    EXPECT_EQ(p1.use_count(), 1);
+  }
+  {
+    HostSharedPtr<T> p1(new T());
+    HostSharedPtr<T> p2(p1);  // copy construction
+    EXPECT_EQ(p1.use_count(), 2);
+    EXPECT_EQ(p2.use_count(), 2);
+  }
+  {
+    HostSharedPtr<T> p1(new T());
+    HostSharedPtr<T> p2(std::move(p1));  // move construction
+    EXPECT_EQ(p2.use_count(), 1);
+  }
+  {
+    HostSharedPtr<T> p1(new T());
+    HostSharedPtr<T> p2;
+    p2 = p1;  // copy assignment
+    EXPECT_EQ(p1.use_count(), 2);
+    EXPECT_EQ(p2.use_count(), 2);
+  }
+  {
+    HostSharedPtr<T> p1(new T());
+    HostSharedPtr<T> p2;
+    p2 = std::move(p1);  // move assignment
+    EXPECT_EQ(p2.use_count(), 1);
+  }
+}
+
+TEST(TEST_CATEGORY, host_shared_ptr_get) {
+  using T = int;
+  {
+    HostSharedPtr<T> p1;
+    EXPECT_EQ(p1.get(), nullptr);
+  }
+  {
+    HostSharedPtr<T> p1(nullptr);
+    EXPECT_EQ(p1.get(), nullptr);
+  }
+  {
+    T* p_i = new T();
+    HostSharedPtr<T> p1(p_i);
+    EXPECT_EQ(p1.get(), p_i);
+  }
+  {
+    T* p_i = new T();
+    HostSharedPtr<T> p1(p_i, [](T* p) { delete p; });
+    EXPECT_EQ(p1.get(), p_i);
+  }
+  {
+    T i;
+    HostSharedPtr<T> p1(&i, [](T*) {});
+    EXPECT_EQ(p1.get(), &i);
+  }
+  {
+    T i;
+    HostSharedPtr<T> p1(&i, [](T*) {});
+    HostSharedPtr<T> p2(p1);  // copy construction
+    EXPECT_EQ(p1.get(), &i);
+    EXPECT_EQ(p1.get(), &i);
+  }
+  {
+    T i;
+    HostSharedPtr<T> p1(&i, [](T*) {});
+    HostSharedPtr<T> p2(std::move(p1));  // move construction
+    EXPECT_EQ(p1.get(), nullptr);
+    EXPECT_EQ(p2.get(), &i);
+  }
+  {
+    T i;
+    HostSharedPtr<T> p1(&i, [](T*) {});
+    HostSharedPtr<T> p2;
+    p2 = p1;  // copy assignment
+    EXPECT_EQ(p1.get(), &i);
+    EXPECT_EQ(p1.get(), &i);
+  }
+  {
+    T i;
+    HostSharedPtr<T> p1(&i, [](T*) {});
+    HostSharedPtr<T> p2;
+    p2 = std::move(p1);  // move assignment
+    EXPECT_EQ(p1.get(), nullptr);
+    EXPECT_EQ(p2.get(), &i);
+  }
+}
diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..18d1ac85188ca17cd7d127d3187103f42402be18
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_HostSharedPtr.hpp>
+#include <Kokkos_Core.hpp>
+
+#include <gtest/gtest.h>
+
+using Kokkos::Impl::HostSharedPtr;
+
+namespace {
+
+class Data {
+  Kokkos::Array<char, 64> d;
+
+ public:
+  KOKKOS_FUNCTION void write(char const* c) {
+    for (int i = 0; i < 64 && c; ++i, ++c) {
+      d[i] = *c;
+    }
+  }
+};
+
+template <class SmartPtr>
+struct CheckAccessStoredPointerAndDereferenceOnDevice {
+  SmartPtr m_device_ptr;
+  using ElementType = typename SmartPtr::element_type;
+  static_assert(std::is_same<ElementType, Data>::value, "");
+
+  CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr)
+      : m_device_ptr(device_ptr) {
+    int errors;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), *this,
+                            errors);
+    EXPECT_EQ(errors, 0);
+  }
+
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    auto raw_ptr = m_device_ptr.get();  // get
+
+    auto tmp = new (raw_ptr) ElementType();
+
+    auto& obj = *m_device_ptr;  // operator*
+    if (&obj != raw_ptr) ++e;
+
+    m_device_ptr->write("hello world");  // operator->
+
+    tmp->~ElementType();
+  }
+};
+
+template <class Ptr>
+CheckAccessStoredPointerAndDereferenceOnDevice<Ptr>
+check_access_stored_pointer_and_dereference_on_device(Ptr p) {
+  return {p};
+}
+
+template <class SmartPtr>
+struct CheckSpecialMembersOnDevice {
+  SmartPtr m_device_ptr;
+
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    SmartPtr p1 = m_device_ptr;   // copy construction
+    SmartPtr p2 = std::move(p1);  // move construction
+
+    p1 = p2;             // copy assignment
+    p2 = std::move(p1);  // move assignment
+
+    SmartPtr p3;  // default constructor
+    if (p3) ++e;
+    SmartPtr p4{nullptr};
+    if (p4) ++e;
+  }
+
+  CheckSpecialMembersOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) {
+    int errors;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), *this,
+                            errors);
+    EXPECT_EQ(errors, 0);
+  }
+};
+
+template <class Ptr>
+CheckSpecialMembersOnDevice<Ptr> check_special_members_on_device(Ptr p) {
+  return {p};
+}
+
+}  // namespace
+
+TEST(TEST_CATEGORY, host_shared_ptr_dereference_on_device) {
+  using T = Data;
+
+  using MemorySpace = TEST_EXECSPACE::memory_space;
+
+  HostSharedPtr<T> device_ptr(
+      static_cast<T*>(Kokkos::kokkos_malloc<MemorySpace>(sizeof(T))),
+      [](T* p) { Kokkos::kokkos_free<MemorySpace>(p); });
+
+  check_access_stored_pointer_and_dereference_on_device(device_ptr);
+}
+
+// FIXME_OPENMPTARGET
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+TEST(TEST_CATEGORY, host_shared_ptr_special_members_on_device) {
+  using T = Data;
+
+  using MemorySpace = TEST_EXECSPACE::memory_space;
+
+  HostSharedPtr<T> device_ptr(
+      static_cast<T*>(Kokkos::kokkos_malloc<MemorySpace>(sizeof(T))),
+      [](T* p) { Kokkos::kokkos_free<MemorySpace>(p); });
+
+  check_special_members_on_device(device_ptr);
+}
+#endif
diff --git a/packages/kokkos/core/unit_test/TestMDRange.hpp b/packages/kokkos/core/unit_test/TestMDRange.hpp
index 90a31fe0f38c565026109f0360c13df19aaf27e1..5618e40989b185a0233de2b20d6dec6636c9fe51 100644
--- a/packages/kokkos/core/unit_test/TestMDRange.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange.hpp
@@ -378,7 +378,7 @@ struct TestMDRange_2D {
       parallel_reduce(
           "rank2-min-reducer", range,
           KOKKOS_LAMBDA(const int i, const int j, double &min_val) {
-            min_val = fmin(v_in(i, j), min_val);
+            min_val = Kokkos::Experimental::fmin(v_in(i, j), min_val);
           },
           reducer_scalar);
 
@@ -1411,8 +1411,13 @@ struct TestMDRange_3D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}},
+                       tile_type{{8, 8, 4}});
+#else
       range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}},
                        tile_type{{8, 8, 8}});
+#endif
       TestMDRange_3D functor(N0, N1, N2);
 
       parallel_for(range, functor);
@@ -1874,8 +1879,13 @@ struct TestMDRange_4D {
       int s1 = 1;
       int s2 = 1;
       int s3 = 1;
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{s0, s1, s2, s3}},
+                       point_type{{N0, N1, N2, N3}}, tile_type{{3, 11, 3, 2}});
+#else
       range_type range(point_type{{s0, s1, s2, s3}},
                        point_type{{N0, N1, N2, N3}}, tile_type{{3, 11, 3, 3}});
+#endif
       TestMDRange_4D functor(N0, N1, N2, N3);
 
       parallel_for(range, functor);
@@ -2440,9 +2450,16 @@ struct TestMDRange_5D {
       int s2 = 1;
       int s3 = 1;
       int s4 = 1;
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{s0, s1, s2, s3, s4}},
+                       point_type{{N0, N1, N2, N3, N4}},
+                       tile_type{{3, 3, 3, 3, 3}});
+#else
       range_type range(point_type{{s0, s1, s2, s3, s4}},
                        point_type{{N0, N1, N2, N3, N4}},
                        tile_type{{3, 3, 3, 3, 5}});
+#endif
+
       TestMDRange_5D functor(N0, N1, N2, N3, N4);
 
       parallel_for(range, functor);
@@ -2767,9 +2784,16 @@ struct TestMDRange_6D {
       int s3 = 1;
       int s4 = 1;
       int s5 = 1;
+
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{3, 3, 3, 2, 2, 2}});
+#else
       range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{3, 3, 3, 3, 3, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -2786,8 +2810,13 @@ struct TestMDRange_6D {
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
+                       {{3, 3, 3, 2, 2, 2}});
+#else
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 3, 3, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -2806,8 +2835,14 @@ struct TestMDRange_6D {
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
+                       {{3, 3, 3, 2, 2, 2}});
+#else
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 3, 3, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -2826,8 +2861,13 @@ struct TestMDRange_6D {
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
+                       {{3, 3, 3, 2, 2, 2}});
+#else
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 3, 3, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -2851,7 +2891,6 @@ struct TestMDRange_6D {
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
-
       range_type range({{1, 1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 1}});
 
@@ -2890,9 +2929,15 @@ struct TestMDRange_6D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{2, 4, 4, 2, 2, 2}});
+#else
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{2, 4, 6, 2, 2, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -3029,11 +3074,17 @@ struct TestMDRange_6D {
       int s3 = 1;
       int s4 = 1;
       int s5 = 1;
-      range_type range(
-          point_type{{s0, s1, s2, s3, s4, s5}},
-          point_type{{N0, N1, N2, N3, N4, N5}},
-          tile_type{{3, 3, 3, 3, 2, 3}});  // tile dims 3,3,3,3,3,3 more than
-                                           // cuda can handle with debugging
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{3, 3, 3, 2, 2, 2}});
+#else
+      // tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
+      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{3, 3, 3, 3, 2, 3}});
+#endif
+
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
       parallel_for(range, functor);
@@ -3070,9 +3121,15 @@ struct TestMDRange_6D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{4, 4, 2, 2, 2, 2}});
+#else
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{4, 4, 4, 2, 2, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -3107,9 +3164,15 @@ struct TestMDRange_6D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{4, 4, 2, 2, 2, 2}});
+#else
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{4, 4, 4, 2, 2, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -3144,9 +3207,15 @@ struct TestMDRange_6D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{4, 4, 2, 2, 2, 2}});
+#else
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{4, 4, 4, 2, 2, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -3181,9 +3250,15 @@ struct TestMDRange_6D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{4, 4, 2, 2, 2, 2}});
+#else
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{4, 4, 4, 2, 2, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -3218,9 +3293,15 @@ struct TestMDRange_6D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{4, 4, 2, 2, 2, 2}});
+#else
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{4, 4, 4, 2, 2, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
@@ -3255,9 +3336,15 @@ struct TestMDRange_6D {
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
+#ifdef KOKKOS_ENABLE_SYCL
+      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
+                       point_type{{N0, N1, N2, N3, N4, N5}},
+                       tile_type{{4, 4, 2, 2, 2, 2}});
+#else
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                        point_type{{N0, N1, N2, N3, N4, N5}},
                        tile_type{{4, 4, 4, 2, 2, 2}});
+#endif
 
       TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);
 
diff --git a/packages/kokkos/core/unit_test/TestMDRange_a.hpp b/packages/kokkos/core/unit_test/TestMDRange_a.hpp
index 3f3d13e7ce9243f962b2e88c3c832d26959fe1bf..0f2abd6d65e921bf07b512984b17ac3d5f5fe67c 100644
--- a/packages/kokkos/core/unit_test/TestMDRange_a.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange_a.hpp
@@ -47,7 +47,10 @@
 namespace Test {
 
 TEST(TEST_CATEGORY, mdrange_5d) {
+// FIXME_OPENMPTARGET requires MDRange parallel_reduce
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
   TestMDRange_5D<TEST_EXECSPACE>::test_reduce5(100, 10, 10, 10, 5);
+#endif
   TestMDRange_5D<TEST_EXECSPACE>::test_for5(100, 10, 10, 10, 5);
 }
 
diff --git a/packages/kokkos/core/unit_test/TestMDRange_b.hpp b/packages/kokkos/core/unit_test/TestMDRange_b.hpp
index f43ba38c7ca840b62878580c28563fe16e84fcaf..85410d5c27fa6ba60c5d8034efa0d30bb1f6db7a 100644
--- a/packages/kokkos/core/unit_test/TestMDRange_b.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange_b.hpp
@@ -48,7 +48,10 @@ namespace Test {
 
 TEST(TEST_CATEGORY, mdrange_6d) {
   TestMDRange_6D<TEST_EXECSPACE>::test_for6(10, 10, 10, 10, 5, 5);
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+  // FIXME_OPENMPTARGET requires MDRange parallel_reduce
   TestMDRange_6D<TEST_EXECSPACE>::test_reduce6(100, 10, 10, 10, 5, 5);
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestMDRange_c.hpp b/packages/kokkos/core/unit_test/TestMDRange_c.hpp
index dbaed8ec128b81f4eeae49a3073d9c19f47ea2bc..9f597ec54b5777fe1df4f7e831c20e9eb1eab38d 100644
--- a/packages/kokkos/core/unit_test/TestMDRange_c.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange_c.hpp
@@ -47,13 +47,18 @@
 namespace Test {
 
 TEST(TEST_CATEGORY, mdrange_2d) {
+// FIXME_OPENMPTARGET requires MDRange parallel_reduce
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
   TestMDRange_2D<TEST_EXECSPACE>::test_reduce2(100, 100);
+#endif
   TestMDRange_2D<TEST_EXECSPACE>::test_for2(100, 100);
 }
 
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY, mdrange_array_reduce) {
   TestMDRange_ReduceArray_2D<TEST_EXECSPACE>::test_arrayreduce2(4, 5);
   TestMDRange_ReduceArray_3D<TEST_EXECSPACE>::test_arrayreduce3(4, 5, 10);
 }
+#endif
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestMDRange_d.hpp b/packages/kokkos/core/unit_test/TestMDRange_d.hpp
index ea5300a1a33735ca183356e6a2a9d9fe4a063645..5ca57ccf483710bdfb7907bcd4e10d03d13ecc39 100644
--- a/packages/kokkos/core/unit_test/TestMDRange_d.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange_d.hpp
@@ -49,10 +49,14 @@ namespace Test {
 TEST(TEST_CATEGORY, mdrange_3d) {
   TestMDRange_3D<TEST_EXECSPACE>::test_for3(1, 10, 100);
   TestMDRange_3D<TEST_EXECSPACE>::test_for3(100, 10, 100);
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+  // FIXME_OPENMPTARGET requires MDRange parallel_reduce
   TestMDRange_3D<TEST_EXECSPACE>::test_reduce3(1, 10, 100);
   TestMDRange_3D<TEST_EXECSPACE>::test_reduce3(100, 10, 100);
+#endif
 }
 
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY, mdrange_neg_idx) {
   TestMDRange_2D_NegIdx<TEST_EXECSPACE>::test_2D_negidx(128, 32);
   TestMDRange_3D_NegIdx<TEST_EXECSPACE>::test_3D_negidx(128, 32, 8);
@@ -60,5 +64,6 @@ TEST(TEST_CATEGORY, mdrange_neg_idx) {
   TestMDRange_5D_NegIdx<TEST_EXECSPACE>::test_5D_negidx(128, 32, 8, 8, 4);
   TestMDRange_6D_NegIdx<TEST_EXECSPACE>::test_6D_negidx(128, 32, 8, 8, 4, 2);
 }
+#endif
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestMDRange_e.hpp b/packages/kokkos/core/unit_test/TestMDRange_e.hpp
index d1576e5e5be0d2e1efa50614bda8b10b657ad7be..b9754e63d56bacb497fec4f932eb348c38f6c79f 100644
--- a/packages/kokkos/core/unit_test/TestMDRange_e.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange_e.hpp
@@ -47,7 +47,10 @@
 namespace Test {
 
 TEST(TEST_CATEGORY, mdrange_4d) {
+// FIXME_OPENMPTARGET requires MDRange parallel_reduce
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
   TestMDRange_4D<TEST_EXECSPACE>::test_reduce4(100, 10, 10, 10);
+#endif
   TestMDRange_4D<TEST_EXECSPACE>::test_for4(100, 10, 10, 10);
 }
 
diff --git a/packages/kokkos/core/unit_test/TestMDRange_f.hpp b/packages/kokkos/core/unit_test/TestMDRange_f.hpp
index 4f10ce273724b7b07c0d5b9733be8448bb9edee5..2cef1324d7c75059dfa50417d940bd7bf40a9763 100644
--- a/packages/kokkos/core/unit_test/TestMDRange_f.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange_f.hpp
@@ -46,8 +46,11 @@
 
 namespace Test {
 
+// FIXME_OPENMPTARGET requires MDRange parallel_reduce
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY, mdrange_scalar) {
   TestMDRange_ReduceScalar<TEST_EXECSPACE>::test_scalar_reduce(12, 11);
 }
+#endif
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..777f91aea3e560981d5dde05767f1726d8a1542f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
@@ -0,0 +1,871 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <algorithm>
+#include <initializer_list>
+#include <type_traits>
+#include "Kokkos_ExecPolicy.hpp"
+#include "Kokkos_Parallel_Reduce.hpp"
+
+#include <cfloat>
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#else
+#define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+#endif
+
+// clang-format off
+template <class>
+struct math_unary_function_return_type;
+// Floating-point types
+template <> struct math_unary_function_return_type<      float> { using type =       float; };
+template <> struct math_unary_function_return_type<     double> { using type =      double; };
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+template <> struct math_unary_function_return_type<long double> { using type = long double; };
+#endif
+// Integral types
+template <> struct math_unary_function_return_type<              bool> { using type = double; };
+template <> struct math_unary_function_return_type<             short> { using type = double; };
+template <> struct math_unary_function_return_type<    unsigned short> { using type = double; };
+template <> struct math_unary_function_return_type<               int> { using type = double; };
+template <> struct math_unary_function_return_type<      unsigned int> { using type = double; };
+template <> struct math_unary_function_return_type<              long> { using type = double; };
+template <> struct math_unary_function_return_type<     unsigned long> { using type = double; };
+template <> struct math_unary_function_return_type<         long long> { using type = double; };
+template <> struct math_unary_function_return_type<unsigned long long> { using type = double; };
+template <class T>
+using math_unary_function_return_type_t = typename math_unary_function_return_type<T>::type;
+template <class, class>
+struct math_binary_function_return_type;
+template <> struct math_binary_function_return_type<             float,              float> { using type =       float; };
+template <> struct math_binary_function_return_type<             float,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<             float,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<             float, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<            double,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<            double, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<             short,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<             short, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<               int,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<               int, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<              long,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<              long, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<         long long, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<    unsigned short, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<      unsigned int, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<     unsigned long, unsigned long long> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,              float> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,             double> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,               bool> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,              short> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,                int> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,               long> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,          long long> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,     unsigned short> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,       unsigned int> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long,      unsigned long> { using type =      double; };
+template <> struct math_binary_function_return_type<unsigned long long, unsigned long long> { using type =      double; };
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+template <> struct math_binary_function_return_type<             float,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<            double,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,              float> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,             double> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,               bool> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,              short> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,                int> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,               long> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,          long long> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,     unsigned short> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,       unsigned int> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double,      unsigned long> { using type = long double; };
+template <> struct math_binary_function_return_type<       long double, unsigned long long> { using type = long double; };
+template <> struct math_binary_function_return_type<             short,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<               int,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<              long,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<         long long,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<    unsigned short,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<      unsigned int,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<     unsigned long,        long double> { using type = long double; };
+template <> struct math_binary_function_return_type<unsigned long long,        long double> { using type = long double; };
+#endif
+template <class T, class U>
+using math_binary_function_return_type_t = typename math_binary_function_return_type<T, U>::type;
+// clang-format on
+
+struct FloatingPointComparison {
+ private:
+  template <class T>
+  KOKKOS_FUNCTION double eps(T) const {
+    return DBL_EPSILON;
+  }
+  KOKKOS_FUNCTION
+  double eps(float) const { return FLT_EPSILON; }
+  KOKKOS_FUNCTION
+  double eps(long double) const { return LDBL_EPSILON; }
+
+  // Using absolute here instead of abs, since we actually test abs ...
+  template <class T>
+  KOKKOS_FUNCTION typename std::enable_if<std::is_signed<T>::value, T>::type
+  absolute(T val) const {
+    return val < T(0) ? -val : val;
+  }
+
+  template <class T>
+  KOKKOS_FUNCTION typename std::enable_if<!std::is_signed<T>::value, T>::type
+  absolute(T val) const {
+    return val;
+  }
+
+ public:
+  template <class FPT>
+  KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const {
+    auto abs_tol = eps(fpv) * ulp;
+
+    bool ar = absolute(fpv) < abs_tol;
+    if (!ar) {
+#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
+      printf("absolute value exceeds tolerance [|%e| > %e]\n", (double)fpv,
+             abs_tol);
+#endif
+    }
+
+    return ar;
+  }
+
+  template <class Lhs, class Rhs>
+  KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs,
+                               double ulp) const {
+    if (lhs == 0) {
+      return compare_near_zero(rhs, ulp);
+    } else if (rhs == 0) {
+      return compare_near_zero(lhs, ulp);
+    } else {
+      auto rel_tol     = (eps(lhs) < eps(rhs) ? eps(lhs) : eps(rhs)) * ulp;
+      double abs_diff  = static_cast<double>(rhs > lhs ? rhs - lhs : lhs - rhs);
+      double min_denom = static_cast<double>(
+          absolute(rhs) < absolute(lhs) ? absolute(rhs) : absolute(lhs));
+      double rel_diff = abs_diff / min_denom;
+      bool ar         = rel_diff < rel_tol;
+      if (!ar) {
+#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
+        printf("relative difference exceeds tolerance [%e > %e]\n",
+               (double)rel_diff, rel_tol);
+#endif
+      }
+
+      return ar;
+    }
+  }
+};
+
+template <class>
+struct math_function_name;
+
+#define DEFINE_UNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                           \
+  struct MathUnaryFunction_##FUNC {                                            \
+    template <typename T>                                                      \
+    static KOKKOS_FUNCTION auto eval(T x) {                                    \
+      static_assert(std::is_same<decltype(Kokkos::Experimental::FUNC((T)0)),   \
+                                 math_unary_function_return_type_t<T>>::value, \
+                    "");                                                       \
+      return Kokkos::Experimental::FUNC(x);                                    \
+    }                                                                          \
+    template <typename T>                                                      \
+    static auto eval_std(T x) {                                                \
+      static_assert(std::is_same<decltype(std::FUNC((T)0)),                    \
+                                 math_unary_function_return_type_t<T>>::value, \
+                    "");                                                       \
+      return std::FUNC(x);                                                     \
+    }                                                                          \
+    static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; }          \
+  };                                                                           \
+  using kk_##FUNC = MathUnaryFunction_##FUNC;                                  \
+  template <>                                                                  \
+  struct math_function_name<MathUnaryFunction_##FUNC> {                        \
+    static constexpr char name[] = #FUNC;                                      \
+  };                                                                           \
+  constexpr char math_function_name<MathUnaryFunction_##FUNC>::name[]
+
+// Generally the expected ULP error should come from here:
+// https://www.gnu.org/software/libc/manual/html_node/Errors-in-Math-Functions.html
+// For now 1s largely seem to work ...
+DEFINE_UNARY_FUNCTION_EVAL(exp, 2);
+DEFINE_UNARY_FUNCTION_EVAL(exp2, 2);
+DEFINE_UNARY_FUNCTION_EVAL(expm1, 2);
+DEFINE_UNARY_FUNCTION_EVAL(log, 2);
+DEFINE_UNARY_FUNCTION_EVAL(log10, 2);
+DEFINE_UNARY_FUNCTION_EVAL(log2, 2);
+DEFINE_UNARY_FUNCTION_EVAL(log1p, 2);
+
+DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2);
+DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2);
+
+DEFINE_UNARY_FUNCTION_EVAL(sin, 2);
+DEFINE_UNARY_FUNCTION_EVAL(cos, 2);
+DEFINE_UNARY_FUNCTION_EVAL(tan, 2);
+DEFINE_UNARY_FUNCTION_EVAL(asin, 2);
+DEFINE_UNARY_FUNCTION_EVAL(acos, 2);
+DEFINE_UNARY_FUNCTION_EVAL(atan, 2);
+
+DEFINE_UNARY_FUNCTION_EVAL(sinh, 2);
+DEFINE_UNARY_FUNCTION_EVAL(cosh, 2);
+DEFINE_UNARY_FUNCTION_EVAL(tanh, 2);
+DEFINE_UNARY_FUNCTION_EVAL(asinh, 4);
+DEFINE_UNARY_FUNCTION_EVAL(acosh, 2);
+DEFINE_UNARY_FUNCTION_EVAL(atanh, 2);
+
+DEFINE_UNARY_FUNCTION_EVAL(erf, 2);
+DEFINE_UNARY_FUNCTION_EVAL(erfc, 5);
+// has a larger error due to some impls doing integer exact.
+// We cast always to double leading to larger difference when comparing our
+// tgamma to std::tgamma on the host.
+DEFINE_UNARY_FUNCTION_EVAL(tgamma, 200);
+DEFINE_UNARY_FUNCTION_EVAL(lgamma, 2);
+
+DEFINE_UNARY_FUNCTION_EVAL(ceil, 2);
+DEFINE_UNARY_FUNCTION_EVAL(floor, 2);
+DEFINE_UNARY_FUNCTION_EVAL(trunc, 2);
+#ifndef KOKKOS_ENABLE_SYCL
+DEFINE_UNARY_FUNCTION_EVAL(nearbyint, 2);
+#endif
+
+#undef DEFINE_UNARY_FUNCTION_EVAL
+
+#define DEFINE_BINARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                    \
+  struct MathBinaryFunction_##FUNC {                                     \
+    template <typename T, typename U>                                    \
+    static KOKKOS_FUNCTION auto eval(T x, U y) {                         \
+      static_assert(                                                     \
+          std::is_same<decltype(Kokkos::Experimental::FUNC((T)0, (U)0)), \
+                       math_binary_function_return_type_t<T, U>>::value, \
+          "");                                                           \
+      return Kokkos::Experimental::FUNC(x, y);                           \
+    }                                                                    \
+    template <typename T, typename U>                                    \
+    static auto eval_std(T x, U y) {                                     \
+      static_assert(                                                     \
+          std::is_same<decltype(std::FUNC((T)0, (U)0)),                  \
+                       math_binary_function_return_type_t<T, U>>::value, \
+          "");                                                           \
+      return std::FUNC(x, y);                                            \
+    }                                                                    \
+    static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; }    \
+  };                                                                     \
+  using kk_##FUNC = MathBinaryFunction_##FUNC;                           \
+  template <>                                                            \
+  struct math_function_name<MathBinaryFunction_##FUNC> {                 \
+    static constexpr char name[] = #FUNC;                                \
+  };                                                                     \
+  constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]
+
+DEFINE_BINARY_FUNCTION_EVAL(pow, 2);
+DEFINE_BINARY_FUNCTION_EVAL(hypot, 2);
+
+#undef DEFINE_BINARY_FUNCTION_EVAL
+
+// clang-format off
+template <class>
+struct type_helper;
+#define DEFINE_TYPE_NAME(T) \
+template <> struct type_helper<T> { static char const * name() { return #T; } };
+DEFINE_TYPE_NAME(bool)
+DEFINE_TYPE_NAME(int)
+DEFINE_TYPE_NAME(long)
+DEFINE_TYPE_NAME(long long)
+DEFINE_TYPE_NAME(unsigned int)
+DEFINE_TYPE_NAME(unsigned long)
+DEFINE_TYPE_NAME(unsigned long long)
+DEFINE_TYPE_NAME(float)
+DEFINE_TYPE_NAME(double)
+DEFINE_TYPE_NAME(long double)
+#undef DEFINE_TYPE_NAME
+// clang-format on
+
+template <class Space, class Func, class Arg, std::size_t N,
+          class Ret = math_unary_function_return_type_t<Arg>>
+struct TestMathUnaryFunction : FloatingPointComparison {
+  Arg val_[N];
+  Ret res_[N];
+  TestMathUnaryFunction(const Arg (&val)[N]) {
+    std::cout << math_function_name<Func>::name << "("
+              << type_helper<Arg>::name() << ")\n";
+    std::copy(val, val + N, val_);
+    std::transform(val, val + N, res_,
+                   [](auto x) { return Func::eval_std(x); });
+    run();
+  }
+  void run() {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, N), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int i, int& e) const {
+    bool ar = compare(Func::eval(val_[i]), res_[i], Func::ulp_factor());
+    if (!ar) {
+      ++e;
+#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
+      printf("value at %f which is %f was expected to be %f\n", (double)val_[i],
+             (double)Func::eval(val_[i]), (double)res_[i]);
+#endif
+    }
+  }
+};
+
+template <class Space, class... Func, class Arg, std::size_t N>
+void do_test_math_unary_function(const Arg (&x)[N]) {
+  (void)std::initializer_list<int>{
+      (TestMathUnaryFunction<Space, Func, Arg, N>(x), 0)...};
+}
+
+#define TEST_MATH_FUNCTION(FUNC) \
+  do_test_math_unary_function<TEST_EXECSPACE, MathUnaryFunction_##FUNC>
+
+template <class Space, class Func, class Arg1, class Arg2,
+          class Ret = math_binary_function_return_type_t<Arg1, Arg2>>
+struct TestMathBinaryFunction : FloatingPointComparison {
+  Arg1 val1_;
+  Arg2 val2_;
+  Ret res_;
+  TestMathBinaryFunction(Arg1 val1, Arg2 val2)
+      : val1_(val1), val2_(val2), res_(Func::eval_std(val1, val2)) {
+    std::cout << math_function_name<Func>::name << "("
+              << type_helper<Arg1>::name() << ", " << type_helper<Arg2>::name()
+              << ")\n";
+    run();
+  }
+  void run() {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    bool ar = compare(Func::eval(val1_, val2_), res_, Func::ulp_factor());
+    if (!ar) {
+      ++e;
+#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
+      printf("value at %f, %f which is %f was expected to be %f\n",
+             (double)val1_, (double)val2_, (double)Func::eval(val1_, val2_),
+             (double)res_);
+#endif
+    }
+  }
+};
+
+template <class Space, class... Func, class Arg1, class Arg2>
+void do_test_math_binary_function(Arg1 arg1, Arg2 arg2) {
+  (void)std::initializer_list<int>{
+      (TestMathBinaryFunction<Space, Func, Arg1, Arg2>(arg1, arg2), 0)...};
+}
+
+TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
+  TEST_MATH_FUNCTION(sin)({true, false});
+  TEST_MATH_FUNCTION(sin)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(sin)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(sin)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(sin)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(sin)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(sin)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(sin)({.1f, .2f, .3f});
+  TEST_MATH_FUNCTION(sin)({.4, .5, .6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(sin)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(cos)({true, false});
+  TEST_MATH_FUNCTION(cos)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(cos)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(cos)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(cos)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(cos)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(cos)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(cos)({.1f, .2f, .3f});
+  TEST_MATH_FUNCTION(cos)({.4, .5, .6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(cos)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(tan)({true, false});
+  TEST_MATH_FUNCTION(tan)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(tan)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(tan)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(tan)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(tan)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(tan)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(tan)({.1f, .2f, .3f});
+  TEST_MATH_FUNCTION(tan)({.4, .5, .6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(tan)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(asin)({true, false});
+  TEST_MATH_FUNCTION(asin)({-1, 0, 1});
+  TEST_MATH_FUNCTION(asin)({-1l, 0l, 1l});
+  TEST_MATH_FUNCTION(asin)({-1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(asin)({0u, 1u});
+  TEST_MATH_FUNCTION(asin)({0ul, 1ul});
+  TEST_MATH_FUNCTION(asin)({0ull, 1ull});
+  TEST_MATH_FUNCTION(asin)({-1.f, .9f, -.8f, .7f, -.6f});
+  TEST_MATH_FUNCTION(asin)({-.5, .4, -.3, .2, -.1, 0.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(asin)({-.5l, .3l, 0.l, .2l, .4l, .6l});
+#endif
+
+  TEST_MATH_FUNCTION(acos)({true, false});
+  TEST_MATH_FUNCTION(acos)({-1, 0, 1});
+  TEST_MATH_FUNCTION(acos)({-1l, 0l, 1l});
+  TEST_MATH_FUNCTION(acos)({-1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(acos)({0u, 1u});
+  TEST_MATH_FUNCTION(acos)({0ul, 1ul});
+  TEST_MATH_FUNCTION(acos)({0ull, 1ull});
+  TEST_MATH_FUNCTION(acos)({-1.f, .9f, -.8f, .7f, -.6f});
+  TEST_MATH_FUNCTION(acos)({-.5, .4, -.3, .2, -.1, 0.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(acos)({-.5l, .3l, 0.l, .2l, .4l, .6l});
+#endif
+
+  TEST_MATH_FUNCTION(atan)({true, false});
+  TEST_MATH_FUNCTION(atan)({-1, 0, 1});
+  TEST_MATH_FUNCTION(atan)({-1l, 0l, 1l});
+  TEST_MATH_FUNCTION(atan)({-1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(atan)({0u, 1u});
+  TEST_MATH_FUNCTION(atan)({0ul, 1ul});
+  TEST_MATH_FUNCTION(atan)({0ull, 1ull});
+  TEST_MATH_FUNCTION(atan)({-1.5f, 1.3f, -1.1f, .9f, -.7f, .5f});
+  TEST_MATH_FUNCTION(atan)({1.4, -1.2, 1., -.8, .6, -.4, .2, -0.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(atan)({-.98l, .67l, -54.l, .34l, -.21l});
+#endif
+
+  // TODO atan2
+}
+
+TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
+  TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11});
+  TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l});
+  TEST_MATH_FUNCTION(sqrt)({0ll, 1ll, 2ll, 3ll, 5ll, 7ll, 11ll});
+  TEST_MATH_FUNCTION(sqrt)({0u, 1u, 2u, 3u, 5u, 7u});
+  TEST_MATH_FUNCTION(sqrt)({0ul, 1ul, 2ul, 3ul, 5ul, 7ul});
+  TEST_MATH_FUNCTION(sqrt)({0ull, 1ull, 2ull, 3ull, 5ull, 7ull});
+  TEST_MATH_FUNCTION(sqrt)({10.f, 20.f, 30.f, 40.f});
+  TEST_MATH_FUNCTION(sqrt)({11.1, 22.2, 33.3, 44.4});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(sqrt)({10.l, 20.l, 30.l, 40.l});
+#endif
+
+  TEST_MATH_FUNCTION(cbrt)({-5, -3, -1, 2, 4, 6});
+  TEST_MATH_FUNCTION(cbrt)({-5l, -3l, -1l, 2l, 4l, 6l});
+  TEST_MATH_FUNCTION(cbrt)({-5ll, -3ll, -1ll, 2ll, 4ll, 6ll});
+  TEST_MATH_FUNCTION(cbrt)({0u, 1u, 2u, 3u, 4u, 5u});
+  TEST_MATH_FUNCTION(cbrt)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
+  TEST_MATH_FUNCTION(cbrt)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_MATH_FUNCTION(cbrt)({-1.f, .2f, -3.f, .4f, -5.f});
+  TEST_MATH_FUNCTION(cbrt)({11.1, -2.2, 33.3, -4.4, 55.5});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(cbrt)({-10.l, 20.l, -30.l, 40.l, -50.l});
+#endif
+
+  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2.f, 3.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2., 3.);
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2.l, 3.l);
+#endif
+
+  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.f, 3.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2., 3.);
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))  // FIXME
+  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.l, 3.l);
+#endif
+#endif
+}
+
+TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
+  TEST_MATH_FUNCTION(exp)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0});
+  TEST_MATH_FUNCTION(exp)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l});
+  TEST_MATH_FUNCTION(exp)({-9ll, -8ll, -7ll, -6ll, -5ll, 4ll, 3ll, 2ll, 1ll});
+  TEST_MATH_FUNCTION(exp)({0u, 1u, 2u, 3u, 4u, 5u});
+  TEST_MATH_FUNCTION(exp)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
+  TEST_MATH_FUNCTION(exp)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_MATH_FUNCTION(exp)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
+  TEST_MATH_FUNCTION(exp)({-98., -7.6, -.54, 3.2, 1., -0.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(exp)({-98.l, -7.6l, -.54l, 3.2l, 1.l, -0.l});
+#endif
+
+  TEST_MATH_FUNCTION(exp2)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0});
+  TEST_MATH_FUNCTION(exp2)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l});
+  TEST_MATH_FUNCTION(exp2)({-9ll, -8ll, -7ll, -6ll, -5ll, 4ll, 3ll, 2ll, 1ll});
+  TEST_MATH_FUNCTION(exp2)({0u, 1u, 2u, 3u, 4u, 5u});
+  TEST_MATH_FUNCTION(exp2)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
+  TEST_MATH_FUNCTION(exp2)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_MATH_FUNCTION(exp2)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
+  TEST_MATH_FUNCTION(exp2)({-98., -7.6, -.54, 3.2, 1., -0.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(exp2)({-98.l, -7.6l, -.54l, 3.2l, 1.l, -0.l});
+#endif
+
+  TEST_MATH_FUNCTION(expm1)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0});
+  TEST_MATH_FUNCTION(expm1)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l});
+  TEST_MATH_FUNCTION(expm1)({-9ll, -8ll, -7ll, -6ll, -5ll, 4ll, 3ll, 2ll, 1ll});
+  TEST_MATH_FUNCTION(expm1)({0u, 1u, 2u, 3u, 4u, 5u});
+  TEST_MATH_FUNCTION(expm1)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
+  TEST_MATH_FUNCTION(expm1)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
+  TEST_MATH_FUNCTION(expm1)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
+  TEST_MATH_FUNCTION(expm1)({-98., -7.6, -.54, 3.2, 1., -0.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(expm1)({-98.l, -7.6l, -.54l, 3.2l, 1.l, -0.l});
+#endif
+
+  TEST_MATH_FUNCTION(log)({1, 23, 456, 7890});
+  TEST_MATH_FUNCTION(log)({1l, 23l, 456l, 7890l});
+  TEST_MATH_FUNCTION(log)({1ll, 23ll, 456ll, 7890ll});
+  TEST_MATH_FUNCTION(log)({1u, 23u, 456u, 7890u});
+  TEST_MATH_FUNCTION(log)({1ul, 23ul, 456ul, 7890ul});
+  TEST_MATH_FUNCTION(log)({1ull, 23ull, 456ull, 7890ull});
+  TEST_MATH_FUNCTION(log)({1234.f, 567.f, 89.f, .1f});
+  TEST_MATH_FUNCTION(log)({1234., 567., 89., .02});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(log)({1234.l, 567.l, 89.l, .003l});
+#endif
+
+  TEST_MATH_FUNCTION(log10)({1, 23, 456, 7890});
+  TEST_MATH_FUNCTION(log10)({1l, 23l, 456l, 7890l});
+  TEST_MATH_FUNCTION(log10)({1ll, 23ll, 456ll, 7890ll});
+  TEST_MATH_FUNCTION(log10)({1u, 23u, 456u, 7890u});
+  TEST_MATH_FUNCTION(log10)({1ul, 23ul, 456ul, 7890ul});
+  TEST_MATH_FUNCTION(log10)({1ull, 23ull, 456ull, 7890ull});
+  TEST_MATH_FUNCTION(log10)({1234.f, 567.f, 89.f, .1f});
+  TEST_MATH_FUNCTION(log10)({1234., 567., 89., .02});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(log10)({1234.l, 567.l, 89.l, .003l});
+#endif
+
+  TEST_MATH_FUNCTION(log2)({1, 23, 456, 7890});
+  TEST_MATH_FUNCTION(log2)({1l, 23l, 456l, 7890l});
+  TEST_MATH_FUNCTION(log2)({1ll, 23ll, 456ll, 7890ll});
+  TEST_MATH_FUNCTION(log2)({1u, 23u, 456u, 7890u});
+  TEST_MATH_FUNCTION(log2)({1ul, 23ul, 456ul, 7890ul});
+  TEST_MATH_FUNCTION(log2)({1ull, 23ull, 456ull, 7890ull});
+  TEST_MATH_FUNCTION(log2)({1234.f, 567.f, 89.f, .1f});
+  TEST_MATH_FUNCTION(log2)({1234., 567., 89., .02});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(log2)({1234.l, 567.l, 89.l, .003l});
+#endif
+
+  TEST_MATH_FUNCTION(log1p)({1, 23, 456, 7890, 0});
+  TEST_MATH_FUNCTION(log1p)({1l, 23l, 456l, 7890l, 0l});
+  TEST_MATH_FUNCTION(log1p)({1ll, 23ll, 456ll, 7890ll, 0ll});
+  TEST_MATH_FUNCTION(log1p)({1u, 23u, 456u, 7890u, 0u});
+  TEST_MATH_FUNCTION(log1p)({1ul, 23ul, 456ul, 7890ul, 0ul});
+  TEST_MATH_FUNCTION(log1p)({1ull, 23ull, 456ull, 7890ull, 0ull});
+  TEST_MATH_FUNCTION(log1p)({1234.f, 567.f, 89.f, -.9f});
+  TEST_MATH_FUNCTION(log1p)({1234., 567., 89., -.08});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(log1p)({1234.l, 567.l, 89.l, -.007l});
+#endif
+}
+
+TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
+  TEST_MATH_FUNCTION(sinh)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(sinh)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(sinh)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(sinh)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(sinh)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(sinh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(sinh)({.1f, -2.f, 3.f});
+  TEST_MATH_FUNCTION(sinh)({-4., .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(sinh)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(cosh)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(cosh)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(cosh)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(cosh)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(cosh)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(cosh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(cosh)({.1f, -2.f, 3.f});
+  TEST_MATH_FUNCTION(cosh)({-4., .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(cosh)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(tanh)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(tanh)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(tanh)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(tanh)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(tanh)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(tanh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(tanh)({.1f, -2.f, 3.f});
+  TEST_MATH_FUNCTION(tanh)({-4., .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(tanh)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(asinh)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(asinh)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(asinh)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(asinh)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(asinh)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(asinh)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(asinh)({.1f, -2.f, 3.f});
+  TEST_MATH_FUNCTION(asinh)({-4., .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(asinh)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(acosh)({1, 2, 3, 4, 5, 6});
+  TEST_MATH_FUNCTION(acosh)({1l, 2l, 3l, 4l, 5l, 6l});
+  TEST_MATH_FUNCTION(acosh)({1ll, 2ll, 3ll, 4ll, 5ll, 6ll});
+  TEST_MATH_FUNCTION(acosh)({1u, 2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(acosh)({1ul, 2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(acosh)({1ull, 2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(acosh)({1.2f, 34.f, 56.f, 789.f});
+  TEST_MATH_FUNCTION(acosh)({1.2, 34., 56., 789.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(acosh)({1.2l, 34.l, 56.l, 789.l});
+#endif
+
+  TEST_MATH_FUNCTION(atanh)({0});
+  TEST_MATH_FUNCTION(atanh)({0l});
+  TEST_MATH_FUNCTION(atanh)({0ll});
+  TEST_MATH_FUNCTION(atanh)({0u});
+  TEST_MATH_FUNCTION(atanh)({0ul});
+  TEST_MATH_FUNCTION(atanh)({0ull});
+  TEST_MATH_FUNCTION(atanh)({-.97f, .86f, -.53f, .42f, -.1f, 0.f});
+  TEST_MATH_FUNCTION(atanh)({-.97, .86, -.53, .42, -.1, 0.});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(atanh)({-.97l, .86l, -.53l, .42l, -.1l, 0.l});
+#endif
+}
+
+TEST(TEST_CATEGORY, mathematical_functions_error_and_gamma_functions) {
+  TEST_MATH_FUNCTION(erf)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(erf)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(erf)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(erf)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(erf)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(erf)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(erf)({.1f, -2.f, 3.f});
+  TEST_MATH_FUNCTION(erf)({-4., .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(erf)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(erfc)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(erfc)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(erfc)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(erfc)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(erfc)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(erfc)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(erfc)({.1f, -2.f, 3.f});
+  TEST_MATH_FUNCTION(erfc)({-4., .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(erfc)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(tgamma)({1, 2, 3, 4, 56, 78});
+  TEST_MATH_FUNCTION(tgamma)({1l, 2l, 3l, 4l, 56l, 78l});
+  TEST_MATH_FUNCTION(tgamma)({1ll, 2ll, 3ll, 4ll, 56ll, 78ll});
+  TEST_MATH_FUNCTION(tgamma)({1u, 2u, 3u, 4u, 56u, 78u});
+  TEST_MATH_FUNCTION(tgamma)({1ul, 2ul, 3ul, 4ul, 56ul, 78ul});
+  TEST_MATH_FUNCTION(tgamma)({1ull, 2ull, 3ull, 4ull, 56ull, 78ull});
+  TEST_MATH_FUNCTION(tgamma)({.1f, -2.2f, 3.f});
+  TEST_MATH_FUNCTION(tgamma)({-4.4, .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(tgamma)({.7l, .8l, .9l});
+#endif
+
+  TEST_MATH_FUNCTION(lgamma)({1, 2, 3, 4, 56, 78});
+  TEST_MATH_FUNCTION(lgamma)({1l, 2l, 3l, 4l, 56l, 78l});
+  TEST_MATH_FUNCTION(lgamma)({1ll, 2ll, 3ll, 4ll, 56ll, 78ll});
+  TEST_MATH_FUNCTION(lgamma)({1u, 2u, 3u, 4u, 56u, 78u});
+  TEST_MATH_FUNCTION(lgamma)({1ul, 2ul, 3ul, 4ul, 56ul, 78ul});
+  TEST_MATH_FUNCTION(lgamma)({1ull, 2ull, 3ull, 4ull, 56ull, 78ull});
+  TEST_MATH_FUNCTION(lgamma)({.1f, -2.2f, 3.f});
+  TEST_MATH_FUNCTION(lgamma)({-4.4, .5, -.6});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(lgamma)({.7l, .8l, .9l});
+#endif
+}
+
+TEST(TEST_CATEGORY,
+     mathematical_functions_nearest_interger_floating_point_operations) {
+  TEST_MATH_FUNCTION(ceil)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(ceil)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(ceil)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(ceil)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(ceil)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(ceil)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(ceil)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_MATH_FUNCTION(ceil)({-6.6, 7.7, -8.8, 9.9});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(ceil)({12.3l, 4.56l, 789.l});
+#endif
+
+  TEST_MATH_FUNCTION(floor)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(floor)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(floor)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(floor)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(floor)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(floor)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(floor)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_MATH_FUNCTION(floor)({-6.6, 7.7, -8.8, 9.9});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(floor)({12.3l, 4.56l, 789.l});
+#endif
+
+  TEST_MATH_FUNCTION(trunc)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(trunc)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(trunc)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(trunc)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(trunc)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(trunc)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(trunc)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_MATH_FUNCTION(trunc)({-6.6, 7.7, -8.8, 9.9});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(trunc)({12.3l, 4.56l, 789.l});
+#endif
+
+#ifndef KOKKOS_ENABLE_SYCL
+  TEST_MATH_FUNCTION(nearbyint)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(nearbyint)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(nearbyint)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(nearbyint)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(nearbyint)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(nearbyint)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(nearbyint)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f});
+  TEST_MATH_FUNCTION(nearbyint)({-6.6, 7.7, -8.8, 9.9});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(nearbyint)({12.3l, 4.56l, 789.l});
+#endif
+#endif
+}
diff --git a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
index 3ee4a25ec03b257ad7a13e8045baaa0fd9be1e9f..6c8a47a5861dd361364a94551abcfd50d0e85153 100644
--- a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
+++ b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
@@ -173,17 +173,17 @@ struct my_complex {
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator==(const my_complex &src) {
+  bool operator==(const my_complex &src) const {
     return (re == src.re) && (im == src.im) && (dummy == src.dummy);
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator!=(const my_complex &src) {
+  bool operator!=(const my_complex &src) const {
     return (re != src.re) || (im != src.im) || (dummy != src.dummy);
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator!=(const double &val) {
+  bool operator!=(const double &val) const {
     return (re != val) || (im != 0) || (dummy != 0);
   }
 
diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe01b83834f26eddc15e71360d77e85452ef0238
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
@@ -0,0 +1,336 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <type_traits>
+#include "Kokkos_NumericTraits.hpp"
+#include "Kokkos_ExecPolicy.hpp"
+
+struct extrema {
+#define DEFINE_EXTREMA(T, m, M)                 \
+  KOKKOS_FUNCTION static T min(T) { return m; } \
+  KOKKOS_FUNCTION static T max(T) { return M; }
+
+  DEFINE_EXTREMA(char, CHAR_MIN, CHAR_MAX);
+  DEFINE_EXTREMA(signed char, SCHAR_MIN, SCHAR_MAX);
+  DEFINE_EXTREMA(unsigned char, 0, UCHAR_MAX);
+  DEFINE_EXTREMA(short, SHRT_MIN, SHRT_MAX);
+  DEFINE_EXTREMA(unsigned short, 0, USHRT_MAX);
+  DEFINE_EXTREMA(int, INT_MIN, INT_MAX);
+  DEFINE_EXTREMA(unsigned, 0U, UINT_MAX);
+  DEFINE_EXTREMA(long, LONG_MIN, LONG_MAX);
+  DEFINE_EXTREMA(unsigned long, 0UL, ULONG_MAX);
+  DEFINE_EXTREMA(long long, LLONG_MIN, LLONG_MAX);
+  DEFINE_EXTREMA(unsigned long long, 0ULL, ULLONG_MAX);
+
+  DEFINE_EXTREMA(float, -FLT_MAX, FLT_MAX);
+  DEFINE_EXTREMA(double, -DBL_MAX, DBL_MAX);
+  DEFINE_EXTREMA(long double, -LDBL_MAX, LDBL_MAX);
+
+#undef DEFINE_EXTREMA
+};
+
+// clang-format off
+struct Infinity { template <class T> using trait = Kokkos::Experimental::infinity<T>; };
+struct Epsilon { template <class T> using trait = Kokkos::Experimental::epsilon<T>; };
+struct FiniteMin { template <class T> using trait = Kokkos::Experimental::finite_min<T>; };
+struct FiniteMax { template <class T> using trait = Kokkos::Experimental::finite_max<T>; };
+struct RoundError { template <class T> using trait = Kokkos::Experimental::round_error<T>; };
+struct NormMin { template <class T> using trait = Kokkos::Experimental::norm_min<T>; };
+struct Digits { template <class T> using trait = Kokkos::Experimental::digits<T>; };
+struct Digits10 { template <class T> using trait = Kokkos::Experimental::digits10<T>; };
+struct MaxDigits10 { template <class T> using trait = Kokkos::Experimental::max_digits10<T>; };
+struct Radix { template <class T> using trait = Kokkos::Experimental::radix<T>; };
+struct MinExponent { template <class T> using trait = Kokkos::Experimental::min_exponent<T>; };
+struct MaxExponent { template <class T> using trait = Kokkos::Experimental::max_exponent<T>; };
+struct MinExponent10 { template <class T> using trait = Kokkos::Experimental::min_exponent10<T>; };
+struct MaxExponent10 { template <class T> using trait = Kokkos::Experimental::max_exponent10<T>; };
+// clang-format on
+
+template <class T>
+KOKKOS_FUNCTION T* take_address_of(T& arg) {
+  return &arg;
+}
+
+template <class T>
+KOKKOS_FUNCTION void take_by_value(T) {}
+
+template <class Space, class T, class Tag>
+struct TestNumericTraits {
+  template <class U>
+  using trait = typename Tag::template trait<U>;
+
+  Kokkos::View<T, Space> compare;
+  TestNumericTraits() {
+    compare = Kokkos::View<T, Space>("C");
+    run();
+  }
+
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space, Tag>(0, 1), *this,
+                            errors);
+    ASSERT_EQ(errors, 0);
+    (void)take_address_of(trait<T>::value);  // use on host
+  }
+
+  KOKKOS_FUNCTION void operator()(Infinity, int, int& e) const {
+    using Kokkos::Experimental::infinity;
+    auto const inf  = infinity<T>::value;
+    auto const zero = T(0);
+    e += (int)!(inf + inf == inf);
+    e += (int)!(inf != zero);
+    use_on_device();
+  }
+
+  KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const {
+    using Kokkos::Experimental::epsilon;
+    auto const eps = epsilon<T>::value;
+    auto const one = T(1);
+    // Avoid higher precision intermediate representation
+    compare() = one + eps;
+    e += (int)!(compare() != one);
+    compare() = one + eps / 2;
+    e += (int)!(compare() == one);
+    use_on_device();
+  }
+
+  KOKKOS_FUNCTION void operator()(FiniteMin, int, int& e) const {
+    using Kokkos::Experimental::finite_max;
+    using Kokkos::Experimental::finite_min;
+    auto const min = finite_min<T>::value;
+    auto const max = finite_max<T>::value;
+    e += (int)!(min == extrema::min(T{}));
+    e += (int)!(max == extrema::max(T{}));
+    use_on_device();
+  }
+
+  // clang-format off
+  KOKKOS_FUNCTION void operator()(FiniteMax, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(RoundError, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(NormMin, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(Digits, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(Digits10, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(MaxDigits10, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(Radix, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(MinExponent, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(MaxExponent, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(MinExponent10, int, int&) const { use_on_device(); }
+  KOKKOS_FUNCTION void operator()(MaxExponent10, int, int&) const { use_on_device(); }
+  // clang-format on
+
+  KOKKOS_FUNCTION void use_on_device() const {
+#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+    take_by_value(trait<T>::value);
+#else
+    (void)take_address_of(trait<T>::value);
+#endif
+  }
+};
+
+#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_SYCL) || \
+    defined(KOKKOS_ENABLE_OPENMPTARGET)
+template <class Tag>
+struct TestNumericTraits<
+#if defined(KOKKOS_ENABLE_CUDA)
+    Kokkos::Cuda,
+#elif defined(KOKKOS_ENABLE_SYCL)
+    Kokkos::Experimental::SYCL,
+#else
+    Kokkos::Experimental::OpenMPTarget,
+#endif
+    long double, Tag> {
+  template <class T>
+  using trait = typename Tag::template trait<T>;
+  TestNumericTraits() {
+    (void)take_address_of(trait<long double>::value);
+    // Do nothing on the device.
+    // According to the doc
+    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-variables
+    // the traits member constant value cannot be directly used in device code.
+  }
+};
+#endif
+
+TEST(TEST_CATEGORY, numeric_traits_infinity) {
+  TestNumericTraits<TEST_EXECSPACE, float, Infinity>();
+  TestNumericTraits<TEST_EXECSPACE, double, Infinity>();
+  TestNumericTraits<TEST_EXECSPACE, long double, Infinity>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_epsilon) {
+  TestNumericTraits<TEST_EXECSPACE, float, Epsilon>();
+  TestNumericTraits<TEST_EXECSPACE, double, Epsilon>();
+#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1
+  TestNumericTraits<TEST_EXECSPACE, long double, Epsilon>();
+#endif
+}
+
+TEST(TEST_CATEGORY, numeric_traits_round_error) {
+  TestNumericTraits<TEST_EXECSPACE, float, RoundError>();
+  TestNumericTraits<TEST_EXECSPACE, double, RoundError>();
+  TestNumericTraits<TEST_EXECSPACE, long double, RoundError>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_norm_min) {
+  TestNumericTraits<TEST_EXECSPACE, float, NormMin>();
+  TestNumericTraits<TEST_EXECSPACE, double, NormMin>();
+  TestNumericTraits<TEST_EXECSPACE, long double, NormMin>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_finite_min_max) {
+  TestNumericTraits<TEST_EXECSPACE, char, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, char, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, signed char, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, signed char, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned char, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned char, FiniteMax>();
+
+  TestNumericTraits<TEST_EXECSPACE, short, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, short, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned short, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned short, FiniteMax>();
+
+  TestNumericTraits<TEST_EXECSPACE, int, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, int, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned int, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned int, FiniteMax>();
+
+  TestNumericTraits<TEST_EXECSPACE, long, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, long, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long, FiniteMax>();
+
+  TestNumericTraits<TEST_EXECSPACE, long long, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, long long, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long long, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long long, FiniteMax>();
+
+  TestNumericTraits<TEST_EXECSPACE, float, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, float, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, double, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, double, FiniteMax>();
+  TestNumericTraits<TEST_EXECSPACE, long double, FiniteMin>();
+  TestNumericTraits<TEST_EXECSPACE, long double, FiniteMax>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_digits) {
+  TestNumericTraits<TEST_EXECSPACE, bool, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, char, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, signed char, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned char, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, short, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned short, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, int, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned int, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, long int, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long int, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, long long int, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, float, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, double, Digits>();
+  TestNumericTraits<TEST_EXECSPACE, long double, Digits>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_digits10) {
+  TestNumericTraits<TEST_EXECSPACE, bool, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, char, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, signed char, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned char, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, short, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned short, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, int, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned int, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, long int, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long int, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, long long int, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, float, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, double, Digits10>();
+  TestNumericTraits<TEST_EXECSPACE, long double, Digits10>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_max_digits10) {
+  TestNumericTraits<TEST_EXECSPACE, float, MaxDigits10>();
+  TestNumericTraits<TEST_EXECSPACE, double, MaxDigits10>();
+  TestNumericTraits<TEST_EXECSPACE, long double, MaxDigits10>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_radix) {
+  TestNumericTraits<TEST_EXECSPACE, bool, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, char, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, signed char, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned char, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, short, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned short, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, int, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned int, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, long int, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long int, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, long long int, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, float, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, double, Radix>();
+  TestNumericTraits<TEST_EXECSPACE, long double, Radix>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) {
+  TestNumericTraits<TEST_EXECSPACE, float, MinExponent>();
+  TestNumericTraits<TEST_EXECSPACE, float, MaxExponent>();
+  TestNumericTraits<TEST_EXECSPACE, double, MinExponent>();
+  TestNumericTraits<TEST_EXECSPACE, double, MaxExponent>();
+  TestNumericTraits<TEST_EXECSPACE, long double, MinExponent>();
+  TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent>();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) {
+  TestNumericTraits<TEST_EXECSPACE, float, MinExponent10>();
+  TestNumericTraits<TEST_EXECSPACE, float, MaxExponent10>();
+  TestNumericTraits<TEST_EXECSPACE, double, MinExponent10>();
+  TestNumericTraits<TEST_EXECSPACE, double, MaxExponent10>();
+  TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>();
+  TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>();
+}
diff --git a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 405782b8f95fb031dd7eb5403c5556661170c6fc..0017c690e75c6e1bde1808e87203d8dbbea754cc 100644
--- a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -48,6 +48,7 @@
 #include <stdexcept>
 #include <sstream>
 #include <iostream>
+#include <type_traits>
 
 namespace Test {
 struct SomeTag {};
@@ -579,7 +580,10 @@ class TestTeamPolicyConstruction {
     policy_t p1(league_size, team_size);
     ASSERT_EQ(p1.league_size(), league_size);
     ASSERT_EQ(p1.team_size(), team_size);
+// FIXME_SYCL implement chunk_size
+#ifndef KOKKOS_ENABLE_SYCL
     ASSERT_TRUE(p1.chunk_size() > 0);
+#endif
     ASSERT_EQ(p1.scratch_size(0), 0);
 
     policy_t p2 = p1.set_chunk_size(chunk_size);
@@ -692,10 +696,7 @@ TEST(TEST_CATEGORY, policy_construction) {
   check_semiregular<Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>>();
 
   TestRangePolicyConstruction<TEST_EXECSPACE>();
-  // FIXME_SYCL requires Team policy
-#ifndef KOKKOS_ENABLE_SYCL
   TestTeamPolicyConstruction<TEST_EXECSPACE>();
-#endif
 }
 
 template <template <class...> class Policy, class... Args>
@@ -709,13 +710,10 @@ void check_converting_constructor_add_work_tag(Policy<Args...> const& policy) {
 TEST(TEST_CATEGORY, policy_converting_constructor_from_other_policy) {
   check_converting_constructor_add_work_tag(
       Kokkos::RangePolicy<TEST_EXECSPACE>{});
-  // FIXME_SYCL requires MDRange policy and Team policy
-#ifndef KOKKOS_ENABLE_SYCL
   check_converting_constructor_add_work_tag(
       Kokkos::TeamPolicy<TEST_EXECSPACE>{});
   check_converting_constructor_add_work_tag(
       Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{});
-#endif
 }
 
 #ifndef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET
@@ -767,32 +765,47 @@ void test_prefer_desired_occupancy(Policy const& policy) {
 template <class... Args>
 struct DummyPolicy : Kokkos::Impl::PolicyTraits<Args...> {
   using execution_policy = DummyPolicy;
-  using traits           = Kokkos::Impl::PolicyTraits<Args...>;
-  template <class... OtherArgs>
-  DummyPolicy(DummyPolicy<OtherArgs...> const& p) : traits(p) {}
-  DummyPolicy() = default;
+
+  using base_t = Kokkos::Impl::PolicyTraits<Args...>;
+  using base_t::base_t;
 };
 
 TEST(TEST_CATEGORY, desired_occupancy_prefer) {
   test_prefer_desired_occupancy(DummyPolicy<TEST_EXECSPACE>{});
   test_prefer_desired_occupancy(Kokkos::RangePolicy<TEST_EXECSPACE>{});
-  // FIXME_SYCL requires MDRange policy and Team policy
-#ifndef KOKKOS_ENABLE_SYCL
   test_prefer_desired_occupancy(
       Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{});
   test_prefer_desired_occupancy(Kokkos::TeamPolicy<TEST_EXECSPACE>{});
-#endif
 }
 
+// For a more informative static assertion:
+template <size_t>
+struct static_assert_dummy_policy_must_be_size_one;
+template <>
+struct static_assert_dummy_policy_must_be_size_one<1> {};
+template <size_t, size_t>
+struct static_assert_dummy_policy_must_be_size_of_desired_occupancy;
+template <>
+struct static_assert_dummy_policy_must_be_size_of_desired_occupancy<
+    sizeof(Kokkos::Experimental::DesiredOccupancy),
+    sizeof(Kokkos::Experimental::DesiredOccupancy)> {};
+
 TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) {
   DummyPolicy<TEST_EXECSPACE> const policy{};
   static_assert(sizeof(decltype(policy)) == 1, "");
+  static_assert_dummy_policy_must_be_size_one<sizeof(decltype(policy))>
+      _assert1{};
+  (void)_assert1;  // avoid unused variable warning
 
   using Kokkos::Experimental::DesiredOccupancy;
   auto policy_with_occ =
       Kokkos::Experimental::prefer(policy, DesiredOccupancy{50});
   static_assert(sizeof(decltype(policy_with_occ)) == sizeof(DesiredOccupancy),
                 "");
+  static_assert_dummy_policy_must_be_size_of_desired_occupancy<
+      sizeof(decltype(policy_with_occ)), sizeof(DesiredOccupancy)>
+      _assert2{};
+  (void)_assert2;  // avoid unused variable warning
 }
 
 template <typename Policy>
@@ -809,16 +822,12 @@ void test_desired_occupancy_converting_constructors(Policy const& policy) {
 TEST(TEST_CATEGORY, desired_occupancy_converting_constructors) {
   test_desired_occupancy_converting_constructors(
       Kokkos::RangePolicy<TEST_EXECSPACE>{});
-  // FIXME_SYCL requires MDRange policy and Team policy
-#ifndef KOKKOS_ENABLE_SYCL
   test_desired_occupancy_converting_constructors(
       Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{});
   test_desired_occupancy_converting_constructors(
       Kokkos::TeamPolicy<TEST_EXECSPACE>{});
-#endif
 }
 
-#ifndef KOKKOS_ENABLE_SYCL
 template <class T>
 void more_md_range_policy_construction_test() {
   (void)Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{
@@ -878,6 +887,30 @@ TEST(TEST_CATEGORY, md_range_policy_construction_from_arrays) {
   more_md_range_policy_construction_test<unsigned long>();
   more_md_range_policy_construction_test<std::int64_t>();
 }
-#endif
 
+template <class WorkTag, class Policy>
+constexpr auto set_worktag(Policy const& policy) {
+  static_assert(Kokkos::is_execution_policy<Policy>::value, "");
+  using PolicyWithWorkTag =
+      Kokkos::Impl::WorkTagTrait::policy_with_trait<Policy, WorkTag>;
+  return PolicyWithWorkTag{policy};
+}
+
+TEST(TEST_CATEGORY, policy_set_worktag) {
+  struct SomeWorkTag {};
+  struct OtherWorkTag {};
+
+  Kokkos::RangePolicy<> p1;
+  static_assert(std::is_void<decltype(p1)::work_tag>::value, "");
+
+  auto p2 = set_worktag<SomeWorkTag>(p1);
+  static_assert(std::is_same<decltype(p2)::work_tag, SomeWorkTag>::value, "");
+
+  auto p3 = set_worktag<OtherWorkTag>(p2);
+  static_assert(std::is_same<decltype(p3)::work_tag, OtherWorkTag>::value, "");
+
+  // NOTE this does not currently compile
+  // auto p4 = set_worktag<void>(p3);
+  // static_assert(std::is_void<decltype(p4)::work_tag>::value, "");
+}
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestRange.hpp b/packages/kokkos/core/unit_test/TestRange.hpp
index 1f14ae4f30502089a3e90afab96b20ecc78a47e8..a6a6220f2dceea470414fb0d712796689f6d151c 100644
--- a/packages/kokkos/core/unit_test/TestRange.hpp
+++ b/packages/kokkos/core/unit_test/TestRange.hpp
@@ -162,9 +162,8 @@ struct TestRange {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyInitTag &, const int i) const {
     if (i != m_flags(i)) {
-#ifndef __SYCL_DEVICE_ONLY__
-      printf("TestRange::test_for_error at %d != %d\n", i, m_flags(i));
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n",
+                                    i, m_flags(i));
     }
   }
 
@@ -176,9 +175,8 @@ struct TestRange {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyResetTag &, const int i) const {
     if (2 * i != m_flags(i)) {
-#ifndef __SYCL_DEVICE_ONLY__
-      printf("TestRange::test_for_error at %d != %d\n", i, m_flags(i));
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n",
+                                    i, m_flags(i));
     }
   }
 
@@ -190,9 +188,8 @@ struct TestRange {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyOffsetTag &, const int i) const {
     if (i + offset != m_flags(i)) {
-#ifndef __SYCL_DEVICE_ONLY__
-      printf("TestRange::test_for_error at %d != %d\n", i + offset, m_flags(i));
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n",
+                                    i + offset, m_flags(i));
     }
   }
 
@@ -275,10 +272,9 @@ struct TestRange {
 
     if (final) {
       if (update != (i * (i + 1)) / 2) {
-#ifndef __SYCL_DEVICE_ONLY__
-        printf("TestRange::test_scan error (%d,%d) : %d != %d\n", i, m_flags(i),
-               (i * (i + 1)) / 2, update);
-#endif
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "TestRange::test_scan error (%d,%d) : %d != %d\n", i, m_flags(i),
+            (i * (i + 1)) / 2, update);
       }
       result_view(i) = update;
     }
diff --git a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
index 18ff450a1a330c832d2bd3e3598391bfa703fc8e..693f19613db6beb8c1c2a551574808de26633726 100644
--- a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
+++ b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
@@ -170,9 +170,8 @@ struct TestRangeRequire {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyInitTag &, const int i) const {
     if (i != m_flags(i)) {
-#ifndef KOKKOS_ENABLE_SYCL
-      printf("TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i));
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i));
     }
   }
 
@@ -184,9 +183,8 @@ struct TestRangeRequire {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyResetTag &, const int i) const {
     if (2 * i != m_flags(i)) {
-#ifndef KOKKOS_ENABLE_SYCL
-      printf("TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i));
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i));
     }
   }
 
@@ -198,10 +196,9 @@ struct TestRangeRequire {
   KOKKOS_INLINE_FUNCTION
   void operator()(const VerifyOffsetTag &, const int i) const {
     if (i + offset != m_flags(i)) {
-#ifndef KOKKOS_ENABLE_SYCL
-      printf("TestRangeRequire::test_for error at %d != %d\n", i + offset,
-             m_flags(i));
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "TestRangeRequire::test_for error at %d != %d\n", i + offset,
+          m_flags(i));
     }
   }
 
@@ -268,10 +265,9 @@ struct TestRangeRequire {
 
     if (final) {
       if (update != (i * (i + 1)) / 2) {
-#ifndef KOKKOS_ENABLE_SYCL
-        printf("TestRangeRequire::test_scan error %d : %d != %d\n", i,
-               (i * (i + 1)) / 2, m_flags(i));
-#endif
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "TestRangeRequire::test_scan error %d : %d != %d\n", i,
+            (i * (i + 1)) / 2, m_flags(i));
       }
     }
   }
diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp
index 9fab5b1f0fa5c42b1ebc5de34dd02fd6f0bca361..5f7fbd5623d6e8e4c25c261a0f092d79c1573fba 100644
--- a/packages/kokkos/core/unit_test/TestReduce.hpp
+++ b/packages/kokkos/core/unit_test/TestReduce.hpp
@@ -51,6 +51,8 @@
 
 namespace Test {
 
+struct ReducerTag {};
+
 template <typename ScalarType, class DeviceType>
 class ReduceFunctor {
  public:
@@ -110,6 +112,45 @@ class ReduceFunctorFinal : public ReduceFunctor<int64_t, DeviceType> {
   }
 };
 
+template <class DeviceType>
+class ReduceFunctorFinalTag {
+ public:
+  using execution_space = DeviceType;
+  using size_type       = typename execution_space::size_type;
+  using ScalarType      = int64_t;
+
+  struct value_type {
+    ScalarType value[3];
+  };
+
+  const size_type nwork;
+
+  KOKKOS_INLINE_FUNCTION
+  ReduceFunctorFinalTag(const size_type arg_nwork) : nwork(arg_nwork) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void join(const ReducerTag, volatile value_type& dst,
+            const volatile value_type& src) const {
+    dst.value[0] += src.value[0];
+    dst.value[1] += src.value[1];
+    dst.value[2] += src.value[2];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const ReducerTag, size_type iwork, value_type& dst) const {
+    dst.value[0] -= 1;
+    dst.value[1] -= iwork + 1;
+    dst.value[2] -= nwork - iwork;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(const ReducerTag, value_type& dst) const {
+    ++dst.value[0];
+    ++dst.value[1];
+    ++dst.value[2];
+  }
+};
+
 template <typename ScalarType, class DeviceType>
 class RuntimeReduceFunctor {
  public:
@@ -141,7 +182,7 @@ class RuntimeReduceFunctor {
   void operator()(size_type iwork, ScalarType dst[]) const {
     const size_type tmp[3] = {1, iwork + 1, nwork - iwork};
 
-    for (size_type i = 0; i < value_count; ++i) {
+    for (size_type i = 0; i < static_cast<size_type>(value_count); ++i) {
       dst[i] += tmp[i % 3];
     }
   }
@@ -189,7 +230,7 @@ class RuntimeReduceMinMax {
     const ScalarType tmp[2] = {ScalarType(iwork + 1),
                                ScalarType(nwork - iwork)};
 
-    for (size_type i = 0; i < value_count; ++i) {
+    for (size_type i = 0; i < static_cast<size_type>(value_count); ++i) {
       dst[i] = i % 2 ? (dst[i] < tmp[i % 2] ? dst[i] : tmp[i % 2])
                      : (dst[i] > tmp[i % 2] ? dst[i] : tmp[i % 2]);
     }
@@ -260,6 +301,7 @@ class TestReduce {
   TestReduce(const size_type& nwork) {
     run_test(nwork);
     run_test_final(nwork);
+    run_test_final_tag(nwork);
   }
 
   void run_test(const size_type& nwork) {
@@ -314,6 +356,39 @@ class TestReduce {
       }
     }
   }
+
+  void run_test_final_tag(const size_type& nwork) {
+    using functor_type = Test::ReduceFunctorFinalTag<execution_space>;
+    using value_type   = typename functor_type::value_type;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[Repeat];
+
+    const uint64_t nw   = nwork;
+    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);
+
+    for (unsigned i = 0; i < Repeat; ++i) {
+      if (i % 2 == 0) {
+        Kokkos::parallel_reduce(
+            Kokkos::RangePolicy<execution_space, ReducerTag>(0, nwork),
+            functor_type(nwork), result[i]);
+      } else {
+        Kokkos::parallel_reduce(
+            "Reduce",
+            Kokkos::RangePolicy<execution_space, ReducerTag>(0, nwork),
+            functor_type(nwork), result[i]);
+      }
+    }
+
+    for (unsigned i = 0; i < Repeat; ++i) {
+      for (unsigned j = 0; j < Count; ++j) {
+        const uint64_t correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ((ScalarType)correct, 1 - result[i].value[j]);
+      }
+    }
+  }
 };
 
 template <typename ScalarType, class DeviceType>
diff --git a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
index f8c693b9602fb5b11cd99999813583c772b6e70b..68e7d746dd91a68046c4d074884ef5aef7519427 100644
--- a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
+++ b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
@@ -102,8 +102,6 @@ struct FunctorScalar<0> {
   void operator()(const int& i, double& update) const { update += i; }
 };
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 template <>
 struct FunctorScalar<1> {
   using team_type = Kokkos::TeamPolicy<>::member_type;
@@ -117,7 +115,6 @@ struct FunctorScalar<1> {
     update += 1.0 / team.team_size() * team.league_rank();
   }
 };
-#endif
 
 template <int ISTEAM>
 struct FunctorScalarInit;
@@ -135,8 +132,6 @@ struct FunctorScalarInit<0> {
   void init(double& update) const { update = 0.0; }
 };
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 template <>
 struct FunctorScalarInit<1> {
   using team_type = Kokkos::TeamPolicy<>::member_type;
@@ -153,7 +148,6 @@ struct FunctorScalarInit<1> {
   KOKKOS_INLINE_FUNCTION
   void init(double& update) const { update = 0.0; }
 };
-#endif
 
 template <int ISTEAM>
 struct FunctorScalarFinal;
@@ -171,8 +165,6 @@ struct FunctorScalarFinal<0> {
   void final(double& update) const { result() = update; }
 };
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 template <>
 struct FunctorScalarFinal<1> {
   using team_type = Kokkos::TeamPolicy<>::member_type;
@@ -189,7 +181,6 @@ struct FunctorScalarFinal<1> {
   KOKKOS_INLINE_FUNCTION
   void final(double& update) const { result() = update; }
 };
-#endif
 
 template <int ISTEAM>
 struct FunctorScalarJoin;
@@ -209,8 +200,6 @@ struct FunctorScalarJoin<0> {
   }
 };
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 template <>
 struct FunctorScalarJoin<1> {
   using team_type = Kokkos::TeamPolicy<>::member_type;
@@ -229,7 +218,6 @@ struct FunctorScalarJoin<1> {
     dst += update;
   }
 };
-#endif
 
 template <int ISTEAM>
 struct FunctorScalarJoinFinal;
@@ -252,8 +240,6 @@ struct FunctorScalarJoinFinal<0> {
   void final(double& update) const { result() = update; }
 };
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 template <>
 struct FunctorScalarJoinFinal<1> {
   using team_type = Kokkos::TeamPolicy<>::member_type;
@@ -275,7 +261,6 @@ struct FunctorScalarJoinFinal<1> {
   KOKKOS_INLINE_FUNCTION
   void final(double& update) const { result() = update; }
 };
-#endif
 
 template <int ISTEAM>
 struct FunctorScalarJoinInit;
@@ -298,8 +283,6 @@ struct FunctorScalarJoinInit<0> {
   void init(double& update) const { update = 0.0; }
 };
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 template <>
 struct FunctorScalarJoinInit<1> {
   using team_type = Kokkos::TeamPolicy<>::member_type;
@@ -321,7 +304,6 @@ struct FunctorScalarJoinInit<1> {
   KOKKOS_INLINE_FUNCTION
   void init(double& update) const { update = 0.0; }
 };
-#endif
 
 template <int ISTEAM>
 struct FunctorScalarJoinFinalInit;
@@ -347,8 +329,6 @@ struct FunctorScalarJoinFinalInit<0> {
   void init(double& update) const { update = 0.0; }
 };
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 template <>
 struct FunctorScalarJoinFinalInit<1> {
   using team_type = Kokkos::TeamPolicy<>::member_type;
@@ -373,7 +353,6 @@ struct FunctorScalarJoinFinalInit<1> {
   KOKKOS_INLINE_FUNCTION
   void init(double& update) const { update = 0.0; }
 };
-#endif
 
 struct Functor1 {
   KOKKOS_INLINE_FUNCTION
diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp
index a8ffe3c0c245b90981ccbddca7f5085f7d0d18c0..35f0e231fd2a7b1e88bbf4be568532aa5c219e3f 100644
--- a/packages/kokkos/core/unit_test/TestReducers.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers.hpp
@@ -1015,7 +1015,12 @@ struct TestReducers {
     test_minloc(10003);
     test_max(10007);
     test_maxloc(10007);
+    // FIXME_OPENMPTARGET - The minmaxloc test fails in the Release and
+    // RelWithDebInfo builds for the OPENMPTARGET backend but passes in Debug
+    // mode.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
     test_minmaxloc(10007);
+#endif
   }
 
   // NOTE test_prod generates N random numbers between 1 and 4.
@@ -1028,7 +1033,12 @@ struct TestReducers {
     test_minloc(10003);
     test_max(10007);
     test_maxloc(10007);
+    // FIXME_OPENMPTARGET - The minmaxloc test fails in the Release and
+    // RelWithDebInfo builds for the OPENMPTARGET backend but passes in Debug
+    // mode.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
     test_minmaxloc(10007);
+#endif
     test_BAnd(35);
     test_BOr(35);
     test_LAnd(35);
diff --git a/packages/kokkos/core/unit_test/TestReducers_d.hpp b/packages/kokkos/core/unit_test/TestReducers_d.hpp
index 44545a89dd93c0b02d6f9130d2fa96a2dcaa93b3..e2254a1c1fe653b22c3e6b9a9ebad50d07a9eb89 100644
--- a/packages/kokkos/core/unit_test/TestReducers_d.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers_d.hpp
@@ -54,8 +54,14 @@ TEST(TEST_CATEGORY, reducers_complex_double) {
 TEST(TEST_CATEGORY, reducers_struct) {
   TestReducers<array_reduce<float, 1>, TEST_EXECSPACE>::test_sum(1031);
   TestReducers<array_reduce<float, 2>, TEST_EXECSPACE>::test_sum(1031);
-  TestReducers<array_reduce<float, 3>, TEST_EXECSPACE>::test_sum(1031);
   TestReducers<array_reduce<float, 4>, TEST_EXECSPACE>::test_sum(1031);
+  // FIXME_OPENMPTARGET - The size of data in array_reduce has to be a power of
+  // 2 for OPENMPTARGET backend in Release and RelWithDebInfo builds.
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  TestReducers<array_reduce<float, 8>, TEST_EXECSPACE>::test_sum(1031);
+#else
+  TestReducers<array_reduce<float, 3>, TEST_EXECSPACE>::test_sum(1031);
   TestReducers<array_reduce<float, 7>, TEST_EXECSPACE>::test_sum(1031);
+#endif
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
index d82709b300dc52110a73d1850cb5cb6378955224..17563de335e5b6a6170985e392ea8ae0de5ae8c1 100644
--- a/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
+++ b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
@@ -109,7 +109,12 @@ struct TeamPolicyFunctor {
 }  // namespace
 
 TEST(TEST_CATEGORY, reduce_device_view_range_policy) {
+  // Avoid running out of memory
+#ifdef KOKKOS_ENABLE_SYCL
+  int N = 100 * 1024 * 1024;
+#else
   int N = 1000 * 1024 * 1024;
+#endif
   test_reduce_device_view(N, Kokkos::RangePolicy<TEST_EXECSPACE>(0, N),
                           RangePolicyFunctor());
 }
@@ -126,10 +131,19 @@ TEST(TEST_CATEGORY, reduce_device_view_mdrange_policy) {
 // FIXME_HIP
 #ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, reduce_device_view_team_policy) {
+// FIXME_SYCL The number of workgroups on CUDA devices can not be larger than
+// 65535
+#ifdef KOKKOS_ENABLE_SYCL
+  int N = 63 * 1024 * 1024;
+  test_reduce_device_view(
+      N, Kokkos::TeamPolicy<TEST_EXECSPACE>(63 * 1024, Kokkos::AUTO),
+      TeamPolicyFunctor(1024));
+#else
   int N = 1000 * 1024 * 1024;
   test_reduce_device_view(
       N, Kokkos::TeamPolicy<TEST_EXECSPACE>(1000 * 1024, Kokkos::AUTO),
       TeamPolicyFunctor(1024));
+#endif
 }
 #endif
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestResize.hpp b/packages/kokkos/core/unit_test/TestResize.hpp
index 0ab6e10c49e95ec115fb76ed16ac7df10b256534..cf5c0df6f9163039fbd3ca1df8aee2a4b24ac882 100644
--- a/packages/kokkos/core/unit_test/TestResize.hpp
+++ b/packages/kokkos/core/unit_test/TestResize.hpp
@@ -76,8 +76,6 @@ void impl_testResize() {
     const int* newPointer = view_1d.data();
     EXPECT_TRUE(oldPointer == newPointer);
   }
-  // FIXME_SYCL needs MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   {
     using view_type = Kokkos::View<int**, DeviceType>;
     view_type view_2d("view_2d", sizes[0], sizes[1]);
@@ -149,7 +147,6 @@ void impl_testResize() {
     const int* newPointer = view_8d.data();
     EXPECT_TRUE(oldPointer == newPointer);
   }
-#endif
   // Resize without initialization: check if data preserved
   {
     using view_type = Kokkos::View<int*, DeviceType>;
@@ -172,8 +169,6 @@ void impl_testResize() {
     }
     EXPECT_TRUE(test == true);
   }
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   {
     using view_type = Kokkos::View<int**, DeviceType>;
     view_type view_2d("view_2d", sizes[0], sizes[1]);
@@ -389,7 +384,6 @@ void impl_testResize() {
     }
     EXPECT_TRUE(test == true);
   }
-#endif
 }
 
 template <class DeviceType>
diff --git a/packages/kokkos/core/unit_test/TestScan.hpp b/packages/kokkos/core/unit_test/TestScan.hpp
index 138570f445ea69efcae9311abeabb036d68af45b..67cb85553d6bf7ccd9cc76b85f7bc32bb0e2e5a7 100644
--- a/packages/kokkos/core/unit_test/TestScan.hpp
+++ b/packages/kokkos/core/unit_test/TestScan.hpp
@@ -75,15 +75,11 @@ struct TestScan {
       if (answer != update) {
         int fail = errors()++;
 
-        // FIXME_SYCL
-#ifndef KOKKOS_ENABLE_SYCL
         if (fail < 20) {
-          printf("TestScan(%d,%ld) != %ld\n", iwork, static_cast<long>(update),
-                 static_cast<long>(answer));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestScan(%d,%ld) != %ld\n", iwork,
+                                        static_cast<long>(update),
+                                        static_cast<long>(answer));
         }
-#else
-        (void)fail;
-#endif
       }
     }
   }
diff --git a/packages/kokkos/core/unit_test/TestSharedAlloc.hpp b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
index bb00a95824909cde25b557f652670753cb8c12f1..b5eb77dc2a964fe1066048b2edfac61d531b4fab 100644
--- a/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
+++ b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -239,6 +239,9 @@ TEST(TEST_CATEGORY, impl_shared_alloc) {
 #elif (TEST_CATEGORY_NUMBER == 6)  // hip
   test_shared_alloc<Kokkos::Experimental::HIPSpace,
                     Kokkos::DefaultHostExecutionSpace>();
+#elif (TEST_CATEGORY_NUMBER == 7)  // sycl
+  test_shared_alloc<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::DefaultHostExecutionSpace>();
 #endif
 #else
   test_shared_alloc<TEST_EXECSPACE, Kokkos::DefaultHostExecutionSpace>();
diff --git a/packages/kokkos/core/unit_test/TestCuda_Category.hpp b/packages/kokkos/core/unit_test/TestSubView_c14.hpp
similarity index 88%
rename from packages/kokkos/core/unit_test/TestCuda_Category.hpp
rename to packages/kokkos/core/unit_test/TestSubView_c14.hpp
index 7c572e3a0887527e48ae32ea1ca9f641f2fbb22e..e6510c83a603481a9b8de0367894ec98407faba3 100644
--- a/packages/kokkos/core/unit_test/TestCuda_Category.hpp
+++ b/packages/kokkos/core/unit_test/TestSubView_c14.hpp
@@ -42,14 +42,15 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_TEST_CUDA_HPP
-#define KOKKOS_TEST_CUDA_HPP
+#ifndef KOKKOS_TEST_SUBVIEW_C14_HPP
+#define KOKKOS_TEST_SUBVIEW_C14_HPP
+#include <TestViewSubview.hpp>
 
-#include <gtest/gtest.h>
+namespace Test {
 
-#define TEST_CATEGORY cuda
-#define TEST_CATEGORY_NUMBER 5
-#define TEST_CATEGORY_DEATH cuda_DeathTest
-#define TEST_EXECSPACE Kokkos::Cuda
+TEST(TEST_CATEGORY, view_subview_memory_traits_construction) {
+  TestViewSubview::test_subview_memory_traits_construction();
+}
 
+}  // namespace Test
 #endif
diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp
index 628def9be5b71635a260b6773a6b347858742677..97ddfd4cf58518bfa494eedf4445ba68fdb1132a 100644
--- a/packages/kokkos/core/unit_test/TestTeam.hpp
+++ b/packages/kokkos/core/unit_test/TestTeam.hpp
@@ -62,10 +62,18 @@ struct TestTeamPolicy {
   view_type m_flags;
 
   TestTeamPolicy(const size_t league_size)
-      : m_flags(Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
-                Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
-                    *this, Kokkos::ParallelReduceTag()),
-                league_size) {}
+      : m_flags(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
+  // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+            Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 32).team_size_max(
+                *this, Kokkos::ParallelReduceTag()),
+#else
+            Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
+                *this, Kokkos::ParallelReduceTag()),
+#endif
+            league_size) {
+  }
 
   struct VerifyInitTag {};
 
@@ -87,9 +95,10 @@ struct TestTeamPolicy {
         member.team_rank() + member.team_size() * member.league_rank();
 
     if (tid != m_flags(member.team_rank(), member.league_rank())) {
-      printf("TestTeamPolicy member(%d,%d) error %d != %d\n",
-             member.league_rank(), member.team_rank(), tid,
-             m_flags(member.team_rank(), member.league_rank()));
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "TestTeamPolicy member(%d,%d) error %d != %d\n", member.league_rank(),
+          member.team_rank(), tid,
+          m_flags(member.team_rank(), member.league_rank()));
     }
   }
 
@@ -120,31 +129,57 @@ struct TestTeamPolicy {
 
   static void test_constructors() {
     constexpr const int smallest_work = 1;
+    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(smallest_work, 32,
+                                                     smallest_work);
+#else
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
         smallest_work, smallest_work, smallest_work);
+#endif
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
         smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
+    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32,
+                                                       Kokkos::AUTO());
+#else
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
         smallest_work, smallest_work, Kokkos::AUTO());
+#endif
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
         smallest_work, Kokkos::AUTO(), smallest_work);
   }
 
   static void test_for(const size_t league_size) {
-    TestTeamPolicy functor(league_size);
-    using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
-    using policy_type_init =
-        Kokkos::TeamPolicy<ScheduleType, ExecSpace, VerifyInitTag>;
-
-    const int team_size = policy_type(league_size, 1)
-                              .team_size_max(functor, Kokkos::ParallelForTag());
-    const int team_size_init =
-        policy_type_init(league_size, 1)
-            .team_size_max(functor, Kokkos::ParallelForTag());
+    {
+      TestTeamPolicy functor(league_size);
+      using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
+      using policy_type_init =
+          Kokkos::TeamPolicy<ScheduleType, ExecSpace, VerifyInitTag>;
+
+      // FIXME_OPENMPTARGET temporary restriction for team size to be at least
+      // 32
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+      const int team_size =
+          policy_type(league_size, 32)
+              .team_size_max(functor, Kokkos::ParallelForTag());
+      const int team_size_init =
+          policy_type_init(league_size, 32)
+              .team_size_max(functor, Kokkos::ParallelForTag());
+#else
+      const int team_size =
+          policy_type(league_size, 1)
+              .team_size_max(functor, Kokkos::ParallelForTag());
+      const int team_size_init =
+          policy_type_init(league_size, 1)
+              .team_size_max(functor, Kokkos::ParallelForTag());
+#endif
 
-    Kokkos::parallel_for(policy_type(league_size, team_size), functor);
-    Kokkos::parallel_for(policy_type_init(league_size, team_size_init),
-                         functor);
+      Kokkos::parallel_for(policy_type(league_size, team_size), functor);
+      Kokkos::parallel_for(policy_type_init(league_size, team_size_init),
+                           functor);
+    }
 
     test_small_league_size();
     test_constructors();
@@ -173,9 +208,16 @@ struct TestTeamPolicy {
     using policy_type_reduce =
         Kokkos::TeamPolicy<ScheduleType, ExecSpace, ReduceTag>;
 
+    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    const int team_size =
+        policy_type_reduce(league_size, 32)
+            .team_size_max(functor, Kokkos::ParallelReduceTag());
+#else
     const int team_size =
         policy_type_reduce(league_size, 1)
             .team_size_max(functor, Kokkos::ParallelReduceTag());
+#endif
 
     const int64_t N = team_size * league_size;
 
@@ -353,7 +395,7 @@ class ScanTeamFunctor {
     ind.team_reduce(Kokkos::Max<int64_t>(m));
 
     if (m != ind.league_rank() + (ind.team_size() - 1)) {
-      printf(
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
           "ScanTeamFunctor[%i.%i of %i.%i] reduce_max_answer(%li) != "
           "reduce_max(%li)\n",
           static_cast<int>(ind.league_rank()),
@@ -375,7 +417,7 @@ class ScanTeamFunctor {
         ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
 
     if (answer != result || answer != result2) {
-      printf(
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
           "ScanTeamFunctor[%i.%i of %i.%i] answer(%li) != scan_first(%li) or "
           "scan_second(%li)\n",
           static_cast<int>(ind.league_rank()),
@@ -476,7 +518,7 @@ struct SharedTeamFunctor {
 
     if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
         (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
-      printf(
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
           "member( %i/%i , %i/%i ) Failed to allocate shared memory of size "
           "%lu\n",
           static_cast<int>(ind.league_rank()),
@@ -522,12 +564,21 @@ struct TestSharedTeam {
         Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
                      Kokkos::MemoryUnmanaged>;
 
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    const size_t team_size =
+        Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max(
+            Functor(), Kokkos::ParallelReduceTag());
+
+    Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(32 / team_size,
+                                                          team_size);
+#else
     const size_t team_size =
         Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1).team_size_max(
             Functor(), Kokkos::ParallelReduceTag());
 
     Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
                                                           team_size);
+#endif
 
     typename Functor::value_type error_count = 0;
 
@@ -559,7 +610,11 @@ struct TestLambdaSharedTeam {
         Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;
 
     const int SHARED_COUNT = 1000;
-    int team_size          = 1;
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    int team_size = 32;
+#else
+    int team_size = 1;
+#endif
 
 #ifdef KOKKOS_ENABLE_CUDA
     if (std::is_same<ExecSpace, Kokkos::Cuda>::value) team_size = 128;
@@ -583,8 +638,9 @@ struct TestLambdaSharedTeam {
 
           if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
               (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
-            printf("Failed to allocate shared memory of size %lu\n",
-                   static_cast<unsigned long>(SHARED_COUNT));
+            KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+                "Failed to allocate shared memory of size %lu\n",
+                static_cast<unsigned long>(SHARED_COUNT));
 
             ++update;  // Failure to allocate is an error.
           } else {
@@ -650,8 +706,9 @@ struct ScratchTeamFunctor {
     if ((scratch_ptr.data() == nullptr) ||
         (scratch_A.data() == nullptr && SHARED_TEAM_COUNT > 0) ||
         (scratch_B.data() == nullptr && SHARED_THREAD_COUNT > 0)) {
-      printf("Failed to allocate shared memory of size %lu\n",
-             static_cast<unsigned long>(SHARED_TEAM_COUNT));
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "Failed to allocate shared memory of size %lu\n",
+          static_cast<unsigned long>(SHARED_TEAM_COUNT));
 
       ++update;  // Failure to allocate is an error.
     } else {
@@ -713,11 +770,19 @@ struct TestScratchTeam {
     int thread_scratch_size = Functor::shared_int_array_type::shmem_size(
         Functor::SHARED_THREAD_COUNT);
 
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    p_type team_exec = p_type(64, 32).set_scratch_size(
+        1,
+        Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
+            Functor::SHARED_TEAM_COUNT)),
+        Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
+#else
     p_type team_exec = p_type(8192, 1).set_scratch_size(
         1,
         Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
             Functor::SHARED_TEAM_COUNT)),
         Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
+#endif
 
     const size_t team_size =
         team_exec.team_size_max(Functor(), Kokkos::ParallelReduceTag());
@@ -726,7 +791,11 @@ struct TestScratchTeam {
         Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
         Functor::shared_int_array_type::shmem_size(3 * team_size);
 
-    team_exec = p_type(8192 / team_size, team_size);
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    team_exec = p_type(64 / team_size, team_size);
+#else
+    team_exec          = p_type(8192 / team_size, team_size);
+#endif
 
     Kokkos::parallel_reduce(
         team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size),
@@ -781,16 +850,17 @@ KOKKOS_INLINE_FUNCTION int test_team_mulit_level_scratch_loop_body(
                        });
   team.team_barrier();
 
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 16), [&](const int &i) {
-    a_thread1(i) = 1000000 + 100000 * team.team_rank() + 16 - i +
-                   team.league_rank() * 100000;
-    a_thread2(i) = 2000000 + 100000 * team.team_rank() + 16 - i +
-                   team.league_rank() * 100000;
-    a_thread3(i) = 3000000 + 100000 * team.team_rank() + 16 - i +
-                   team.league_rank() * 100000;
-  });
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, int(0), unsigned(16)),
+                       [&](const int &i) {
+                         a_thread1(i) = 1000000 + 100000 * team.team_rank() +
+                                        16 - i + team.league_rank() * 100000;
+                         a_thread2(i) = 2000000 + 100000 * team.team_rank() +
+                                        16 - i + team.league_rank() * 100000;
+                         a_thread3(i) = 3000000 + 100000 * team.team_rank() +
+                                        16 - i + team.league_rank() * 100000;
+                       });
 
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, 12800),
+  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(12800)),
                        [&](const int &i) {
                          b_team1(i) = 1000000 + i + team.league_rank() * 100000;
                          b_team2(i) = 2000000 + i + team.league_rank() * 100000;
@@ -1218,8 +1288,16 @@ struct TestTeamBroadcast<
     using policy_type_f =
         Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
 
+    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    int fake_team_size =
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
+                                                                           : 1;
+#else
+    int fake_team_size = 1;
+#endif
     const int team_size =
-        policy_type_f(league_size, 1)
+        policy_type_f(league_size, fake_team_size)
             .team_size_max(
                 functor,
                 Kokkos::
@@ -1364,13 +1442,20 @@ struct TestTeamBroadcast<
     using policy_type_f =
         Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
 
+    // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    int fake_team_size =
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
+                                                                           : 1;
+#else
+    int fake_team_size = 1;
+#endif
     const int team_size =
-        policy_type_f(league_size, 1)
+        policy_type_f(league_size, fake_team_size)
             .team_size_max(
                 functor,
                 Kokkos::
                     ParallelReduceTag());  // printf("team_size=%d\n",team_size);
-
     // team_broadcast with value
     value_type total = 0;
 
@@ -1422,10 +1507,15 @@ struct TestScratchAlignment {
       Kokkos::View<int *, typename ExecSpace::scratch_memory_space>;
   void test(bool allocate_small) {
     int shmem_size = ScratchView::shmem_size(11);
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    int team_size = 32;
+#else
+    int team_size      = 1;
+#endif
     if (allocate_small) shmem_size += ScratchViewInt::shmem_size(1);
     Kokkos::parallel_for(
-        Kokkos::TeamPolicy<ExecSpace>(1, 1).set_scratch_size(
-            0, Kokkos::PerTeam(shmem_size)),
+        Kokkos::TeamPolicy<ExecSpace>(1, team_size)
+            .set_scratch_size(0, Kokkos::PerTeam(shmem_size)),
         KOKKOS_LAMBDA(
             const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
           if (allocate_small) ScratchViewInt p(team.team_scratch(0), 1);
@@ -1439,6 +1529,38 @@ struct TestScratchAlignment {
 
 }  // namespace
 
+namespace {
+
+template <class ExecSpace>
+struct TestTeamPolicyHandleByValue {
+  using scalar     = double;
+  using exec_space = ExecSpace;
+  using mem_space  = typename ExecSpace::memory_space;
+
+  TestTeamPolicyHandleByValue() { test(); }
+
+  void test() {
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+    const int M = 1, N = 1;
+    Kokkos::View<scalar **, mem_space> a("a", M, N);
+    Kokkos::View<scalar **, mem_space> b("b", M, N);
+    Kokkos::deep_copy(a, 0.0);
+    Kokkos::deep_copy(b, 1.0);
+    Kokkos::parallel_for(
+        "test_tphandle_by_value",
+        Kokkos::TeamPolicy<exec_space>(M, Kokkos::AUTO(), 1),
+        KOKKOS_LAMBDA(
+            const typename Kokkos::TeamPolicy<exec_space>::member_type team) {
+          const int i = team.league_rank();
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, N),
+                               [&](const int j) { a(i, j) += b(i, j); });
+        });
+#endif
+  }
+};
+
+}  // namespace
+
 }  // namespace Test
 
 /*--------------------------------------------------------------------------*/
diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
index 1700a74124d377b17944203cad01a8a31c6c16cd..87c010ac2a0c5701916049532a715c6a5addce15 100644
--- a/packages/kokkos/core/unit_test/TestTeamBasic.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
@@ -65,6 +65,8 @@ TEST(TEST_CATEGORY, team_for) {
       1000);
 }
 
+// FIXME_OPENMPTARGET wrong results
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY, team_reduce) {
   TestTeamPolicy<TEST_EXECSPACE,
                  Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
@@ -79,42 +81,31 @@ TEST(TEST_CATEGORY, team_reduce) {
   TestTeamPolicy<TEST_EXECSPACE,
                  Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
 }
-
-TEST(TEST_CATEGORY, team_broadcast_long) {
-  // FIXME_OPENMPTARGET
-#ifdef KOKKOS_ENABLE_OPENMPTARGET
-  if constexpr (!std::is_same<TEST_EXECSPACE,
-                              Kokkos::Experimental::OpenMPTarget>::value)
 #endif
-  {
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
-                      long>::test_teambroadcast(0, 1);
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
-                      long>::test_teambroadcast(0, 1);
-
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
-                      long>::test_teambroadcast(2, 1);
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
-                      long>::test_teambroadcast(2, 1);
-
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
-                      long>::test_teambroadcast(16, 1);
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
-                      long>::test_teambroadcast(16, 1);
 
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
-                      long>::test_teambroadcast(1000, 1);
-    TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
-                      long>::test_teambroadcast(1000, 1);
-  }
+TEST(TEST_CATEGORY, team_broadcast_long) {
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long>::test_teambroadcast(0, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long>::test_teambroadcast(0, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long>::test_teambroadcast(2, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long>::test_teambroadcast(2, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long>::test_teambroadcast(16, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long>::test_teambroadcast(16, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long>::test_teambroadcast(1000, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long>::test_teambroadcast(1000, 1);
 }
 
 TEST(TEST_CATEGORY, team_broadcast_char) {
-  // FIXME_OPENMPTARGET
-#ifdef KOKKOS_ENABLE_OPENMPTARGET
-  if constexpr (!std::is_same<TEST_EXECSPACE,
-                              Kokkos::Experimental::OpenMPTarget>::value)
-#endif
   {
     TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
                       unsigned char>::test_teambroadcast(0, 1);
@@ -139,11 +130,6 @@ TEST(TEST_CATEGORY, team_broadcast_char) {
 }
 
 TEST(TEST_CATEGORY, team_broadcast_float) {
-  // FIXME_OPENMPTARGET
-#ifdef KOKKOS_ENABLE_OPENMPTARGET
-  if constexpr (!std::is_same<TEST_EXECSPACE,
-                              Kokkos::Experimental::OpenMPTarget>::value)
-#endif
   {
     TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
                       float>::test_teambroadcast(0, 1.3);
@@ -178,11 +164,6 @@ TEST(TEST_CATEGORY, team_broadcast_float) {
 }
 
 TEST(TEST_CATEGORY, team_broadcast_double) {
-  // FIXME_OPENMPTARGET
-#ifdef KOKKOS_ENABLE_OPENMPTARGET
-  if constexpr (!std::is_same<TEST_EXECSPACE,
-                              Kokkos::Experimental::OpenMPTarget>::value)
-#endif
   {
     TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
                       double>::test_teambroadcast(0, 1.3);
@@ -217,6 +198,10 @@ TEST(TEST_CATEGORY, team_broadcast_double) {
   }
 }
 
+TEST(TEST_CATEGORY, team_handle_by_value) {
+  { TestTeamPolicyHandleByValue<TEST_EXECSPACE>(); }
+}
+
 }  // namespace Test
 
 #ifndef KOKKOS_ENABLE_OPENMPTARGET
diff --git a/packages/kokkos/core/unit_test/TestTeamScratch.hpp b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
index fd0f052b765253b96fd283861314d68d6f2d81a4..75ca3587629ded5f5cc2dd2f3b8ef6623e8a07f7 100644
--- a/packages/kokkos/core/unit_test/TestTeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
@@ -80,6 +80,10 @@ TEST(TEST_CATEGORY, shmem_size) { TestShmemSize<TEST_EXECSPACE>(); }
 TEST(TEST_CATEGORY, multi_level_scratch) {
   // FIXME_HIP the parallel_for and the parallel_reduce in this test requires a
   // team size larger than 256. Fixed In ROCm 3.9
+  // FIXME_OPENMPTARGET This unit test needs ~350KB of scratch memory for L0 and
+  // L1 combined per team. Currently OpenMPTarget cannot allocate this high
+  // amount of scratch memory.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
 #if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309)
   if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
 #endif
@@ -89,6 +93,7 @@ TEST(TEST_CATEGORY, multi_level_scratch) {
     TestMultiLevelScratchTeam<TEST_EXECSPACE,
                               Kokkos::Schedule<Kokkos::Dynamic> >();
   }
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
index 68f99fa3a0f99a376ff355a4ac9fb5b4f3fefd84..992e80397bacb9b5dc9a0746ca2543a1792cce22 100644
--- a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
@@ -145,8 +145,14 @@ template <class T, int N, class PolicyType>
 void test_team_policy_max_recommended(int scratch_size) {
   test_team_policy_max_recommended_static_size<T, N, PolicyType, 1>(
       scratch_size);
+  // FIXME_SYCL prevent running out of total kernel argument size limit
+#ifdef KOKKOS_ENABLE_SYCL
+  test_team_policy_max_recommended_static_size<T, N, PolicyType, 100>(
+      scratch_size);
+#else
   test_team_policy_max_recommended_static_size<T, N, PolicyType, 1000>(
       scratch_size);
+#endif
 }
 
 TEST(TEST_CATEGORY, team_policy_max_recommended) {
@@ -186,7 +192,8 @@ template <typename TeamHandleType, typename ReducerValueType>
 struct PrintFunctor1 {
   KOKKOS_INLINE_FUNCTION void operator()(const TeamHandleType& team,
                                          ReducerValueType&) const {
-    printf("Test %i %i\n", int(team.league_rank()), int(team.team_rank()));
+    KOKKOS_IMPL_DO_NOT_USE_PRINTF("Test %i %i\n", int(team.league_rank()),
+                                  int(team.team_rank()));
   }
 };
 
@@ -194,7 +201,8 @@ template <typename TeamHandleType, typename ReducerValueType>
 struct PrintFunctor2 {
   KOKKOS_INLINE_FUNCTION void operator()(const TeamHandleType& team,
                                          ReducerValueType& teamVal) const {
-    printf("Test %i %i\n", int(team.league_rank()), int(team.team_rank()));
+    KOKKOS_IMPL_DO_NOT_USE_PRINTF("Test %i %i\n", int(team.league_rank()),
+                                  int(team.team_rank()));
     teamVal += 1;
   }
 };
diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp
index c2f47c9ec74e3c81e37a824758c3e292c0d58322..ba11dc07a962989f2826a3d0def3649112c00da6 100644
--- a/packages/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp
@@ -76,9 +76,11 @@ struct functor_team_for {
     const size_type shmemSize = team.team_size() * 13;
     shared_int values         = shared_int(team.team_shmem(), shmemSize);
 
-    if (values.data() == nullptr || values.extent(0) < shmemSize) {
-      printf("FAILED to allocate shared memory of size %u\n",
-             static_cast<unsigned int>(shmemSize));
+    if (values.data() == nullptr ||
+        static_cast<size_type>(values.extent(0)) < shmemSize) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "FAILED to allocate shared memory of size %u\n",
+          static_cast<unsigned int>(shmemSize));
     } else {
       // Initialize shared memory.
       values(team.team_rank()) = 0;
@@ -108,9 +110,10 @@ struct functor_team_for {
         }
 
         if (test != value) {
-          printf("FAILED team_parallel_for %i %i %f %f\n", team.league_rank(),
-                 team.team_rank(), static_cast<double>(test),
-                 static_cast<double>(value));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+              "FAILED team_parallel_for %i %i %f %f\n", team.league_rank(),
+              team.team_rank(), static_cast<double>(test),
+              static_cast<double>(value));
           flag() = 1;
         }
       });
@@ -166,17 +169,18 @@ struct functor_team_reduce {
 
       if (test != value) {
         if (team.league_rank() == 0) {
-          printf("FAILED team_parallel_reduce %i %i %lf %lf %lu\n",
-                 team.league_rank(), team.team_rank(),
-                 static_cast<double>(test), static_cast<double>(value),
-                 static_cast<unsigned long>(sizeof(Scalar)));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+              "FAILED team_parallel_reduce %i %i %lf %lf %lu\n",
+              team.league_rank(), team.team_rank(), static_cast<double>(test),
+              static_cast<double>(value),
+              static_cast<unsigned long>(sizeof(Scalar)));
         }
 
         flag() = 1;
       }
       if (test != shared_value(0)) {
         if (team.league_rank() == 0) {
-          printf(
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
               "FAILED team_parallel_reduce with shared result %i %i %lf %lf "
               "%lu\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
@@ -237,14 +241,15 @@ struct functor_team_reduce_reducer {
       }
 
       if (test != value) {
-        printf("FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
-               team.league_rank(), team.team_rank(), static_cast<double>(test),
-               static_cast<double>(value));
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
+            team.league_rank(), team.team_rank(), static_cast<double>(test),
+            static_cast<double>(value));
 
         flag() = 1;
       }
       if (test != shared_value(0)) {
-        printf(
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
             "FAILED team_vector_parallel_reduce_reducer shared value %i %i %lf "
             "%lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
@@ -281,9 +286,11 @@ struct functor_team_vector_for {
     const size_type shmemSize = team.team_size() * 13;
     shared_int values         = shared_int(team.team_shmem(), shmemSize);
 
-    if (values.data() == nullptr || values.extent(0) < shmemSize) {
-      printf("FAILED to allocate shared memory of size %u\n",
-             static_cast<unsigned int>(shmemSize));
+    if (values.data() == nullptr ||
+        static_cast<size_type>(values.extent(0)) < shmemSize) {
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "FAILED to allocate shared memory of size %u\n",
+          static_cast<unsigned int>(shmemSize));
     } else {
       team.team_barrier();
 
@@ -313,9 +320,10 @@ struct functor_team_vector_for {
         }
 
         if (test != value) {
-          printf("FAILED team_vector_parallel_for %i %i %f %f\n",
-                 team.league_rank(), team.team_rank(),
-                 static_cast<double>(test), static_cast<double>(value));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+              "FAILED team_vector_parallel_for %i %i %f %f\n",
+              team.league_rank(), team.team_rank(), static_cast<double>(test),
+              static_cast<double>(value));
 
           flag() = 1;
         }
@@ -363,10 +371,11 @@ struct functor_team_vector_reduce {
 
       if (test != value) {
         if (team.league_rank() == 0) {
-          printf("FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
-                 team.league_rank(), team.team_rank(),
-                 static_cast<double>(test), static_cast<double>(value),
-                 static_cast<unsigned long>(sizeof(Scalar)));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+              "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+              team.league_rank(), team.team_rank(), static_cast<double>(test),
+              static_cast<double>(value),
+              static_cast<unsigned long>(sizeof(Scalar)));
         }
 
         flag() = 1;
@@ -414,9 +423,10 @@ struct functor_team_vector_reduce_reducer {
       }
 
       if (test != value) {
-        printf("FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
-               team.league_rank(), team.team_rank(), static_cast<double>(test),
-               static_cast<double>(value));
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
+            team.league_rank(), team.team_rank(), static_cast<double>(test),
+            static_cast<double>(value));
 
         flag() = 1;
       }
@@ -460,8 +470,9 @@ struct functor_vec_single {
         [&](int /*i*/, Scalar &val) { val += value; }, value2);
 
     if (value2 != (value * Scalar(nEnd - nStart))) {
-      printf("FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(),
-             team.team_rank(), (double)value2, (double)value);
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(),
+          team.team_rank(), (double)value2, (double)value);
 
       flag() = 1;
     }
@@ -491,8 +502,8 @@ struct functor_vec_for {
 
     if (values.data() == nullptr ||
         values.extent(0) < (unsigned)team.team_size() * 13) {
-      printf("FAILED to allocate memory of size %i\n",
-             static_cast<int>(team.team_size() * 13));
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED to allocate memory of size %i\n",
+                                    static_cast<int>(team.team_size() * 13));
       flag() = 1;
     } else {
       Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 13), [&](int i) {
@@ -512,9 +523,10 @@ struct functor_vec_for {
         }
 
         if (test != value) {
-          printf("FAILED vector_par_for %i %i %f %f\n", team.league_rank(),
-                 team.team_rank(), static_cast<double>(test),
-                 static_cast<double>(value));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %f %f\n",
+                                        team.league_rank(), team.team_rank(),
+                                        static_cast<double>(test),
+                                        static_cast<double>(value));
 
           flag() = 1;
         }
@@ -548,8 +560,9 @@ struct functor_vec_red {
       for (int i = 0; i < 13; i++) test += i;
 
       if (test != value) {
-        printf("FAILED vector_par_reduce %i %i %f %f\n", team.league_rank(),
-               team.team_rank(), (double)test, (double)value);
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_reduce %i %i %f %f\n",
+                                      team.league_rank(), team.team_rank(),
+                                      (double)test, (double)value);
 
         flag() = 1;
       }
@@ -586,9 +599,9 @@ struct functor_vec_red_reducer {
       for (int i = 0; i < 13; i++) test *= (i % 5 + 1);
 
       if (test != value) {
-        printf("FAILED vector_par_reduce_reducer %i %i %f %f\n",
-               team.league_rank(), team.team_rank(), (double)test,
-               (double)value);
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "FAILED vector_par_reduce_reducer %i %i %f %f\n",
+            team.league_rank(), team.team_rank(), (double)test, (double)value);
 
         flag() = 1;
       }
@@ -616,9 +629,10 @@ struct functor_vec_scan {
                               for (int k = 0; k <= i; k++) test += k;
 
                               if (test != val) {
-                                printf("FAILED vector_par_scan %i %i %f %f\n",
-                                       team.league_rank(), team.team_rank(),
-                                       (double)test, (double)val);
+                                KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+                                    "FAILED vector_par_scan %i %i %f %f\n",
+                                    team.league_rank(), team.team_rank(),
+                                    (double)test, (double)val);
 
                                 flag() = 1;
                               }
@@ -850,6 +864,120 @@ class TestTripleNestedReduce {
 
 #endif
 
+namespace VectorScanReducer {
+enum class ScanType : bool { Inclusive, Exclusive };
+
+template <typename ExecutionSpace, ScanType scan_type, int n,
+          int n_vector_range, class Reducer>
+struct checkScan {
+  const int n_team_thread_range = 1000;
+  const int n_per_team          = n_team_thread_range * n_vector_range;
+
+  using size_type  = typename ExecutionSpace::size_type;
+  using value_type = typename Reducer::value_type;
+  using view_type  = Kokkos::View<value_type[n], ExecutionSpace>;
+
+  view_type inputs  = view_type{"inputs"};
+  view_type outputs = view_type{"outputs"};
+
+  value_type result;
+  Reducer reducer = {result};
+
+  struct ThreadVectorFunctor {
+    KOKKOS_FUNCTION void operator()(const size_type j, value_type &update,
+                                    const bool final) const {
+      const size_type element = j + m_team_offset + m_thread_offset;
+      const auto tmp          = m_inputs(element);
+      if (scan_type == ScanType::Inclusive) {
+        m_reducer.join(update, tmp);
+        if (final) {
+          m_outputs(element) = update;
+        }
+      } else {
+        if (final) {
+          m_outputs(element) = update;
+        }
+        m_reducer.join(update, tmp);
+      }
+    }
+
+    const Reducer &m_reducer;
+    const size_type &m_team_offset;
+    const size_type &m_thread_offset;
+    const view_type &m_outputs;
+    const view_type &m_inputs;
+  };
+
+  struct TeamThreadRangeFunctor {
+    KOKKOS_FUNCTION void operator()(const size_type i) const {
+      const size_type thread_offset = i * n_vector_range;
+      Kokkos::parallel_scan(
+          Kokkos::ThreadVectorRange(m_team, n_vector_range),
+          ThreadVectorFunctor{m_reducer, m_team_offset, thread_offset,
+                              m_outputs, m_inputs},
+          m_reducer);
+    }
+
+    const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &m_team;
+    const Reducer &m_reducer;
+    const size_type &m_team_offset;
+    const view_type &m_outputs;
+    const view_type &m_inputs;
+  };
+
+  KOKKOS_FUNCTION void operator()(
+      const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
+      const {
+    const size_type iTeam       = team.league_rank();
+    const size_type iTeamOffset = iTeam * n_per_team;
+    Kokkos::parallel_for(
+        Kokkos::TeamThreadRange(team, n_team_thread_range),
+        TeamThreadRangeFunctor{team, reducer, iTeamOffset, outputs, inputs});
+  }
+
+  KOKKOS_FUNCTION void operator()(size_type i) const { inputs(i) = i * 1. / n; }
+
+  void run() {
+    const int n_teams = n / n_per_team;
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, n), *this);
+
+    // run ThreadVectorRange parallel_scan
+    Kokkos::TeamPolicy<ExecutionSpace> policy(n_teams, Kokkos::AUTO,
+                                              Kokkos::AUTO);
+    const std::string label =
+        (scan_type == ScanType::Inclusive ? std::string("inclusive")
+                                          : std::string("exclusive")) +
+        "Scan" + typeid(Reducer).name();
+    Kokkos::parallel_for(label, policy, *this);
+    Kokkos::fence();
+
+    auto host_outputs =
+        Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, outputs);
+    auto host_inputs =
+        Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, inputs);
+
+    Kokkos::View<value_type[n], Kokkos::HostSpace> expected("expected");
+    {
+      value_type identity;
+      reducer.init(identity);
+      for (int i = 0; i < expected.extent_int(0); ++i) {
+        const int vector       = i % n_vector_range;
+        const value_type accum = vector == 0 ? identity : expected(i - 1);
+        const value_type val =
+            scan_type == ScanType::Inclusive
+                ? host_inputs(i)
+                : (vector == 0 ? identity : host_inputs(i - 1));
+        expected(i) = accum;
+        reducer.join(expected(i), val);
+      }
+    }
+    for (int i = 0; i < host_outputs.extent_int(0); ++i)
+      ASSERT_EQ(host_outputs(i), expected(i));
+  }
+};
+}  // namespace VectorScanReducer
+
 #if !(defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ENABLE_HIP))
 TEST(TEST_CATEGORY, team_vector) {
   ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(0)));
@@ -887,4 +1015,33 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) {
 }
 #endif
 
+TEST(TEST_CATEGORY, parallel_scan_with_reducers) {
+  using T = double;
+  using namespace VectorScanReducer;
+
+  static constexpr int n              = 1000000;
+  static constexpr int n_vector_range = 100;
+
+  checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range,
+            Kokkos::Prod<T, TEST_EXECSPACE>>()
+      .run();
+  checkScan<TEST_EXECSPACE, ScanType::Inclusive, n, n_vector_range,
+            Kokkos::Prod<T, TEST_EXECSPACE>>()
+      .run();
+
+  checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range,
+            Kokkos::Max<T, TEST_EXECSPACE>>()
+      .run();
+  checkScan<TEST_EXECSPACE, ScanType::Inclusive, n, n_vector_range,
+            Kokkos::Max<T, TEST_EXECSPACE>>()
+      .run();
+
+  checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range,
+            Kokkos::Min<T, TEST_EXECSPACE>>()
+      .run();
+  checkScan<TEST_EXECSPACE, ScanType::Inclusive, n, n_vector_range,
+            Kokkos::Min<T, TEST_EXECSPACE>>()
+      .run();
+}
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
index 1b64fef0507ac45863ce62e8d2f375a877035891..7342ebad8433526719b52058ff6d6b75e41a107a 100644
--- a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
@@ -169,17 +169,17 @@ struct my_complex {
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator==(const my_complex& src) {
+  bool operator==(const my_complex& src) const {
     return (re == src.re) && (im == src.im) && (dummy == src.dummy);
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator!=(const my_complex& src) {
+  bool operator!=(const my_complex& src) const {
     return (re != src.re) || (im != src.im) || (dummy != src.dummy);
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator!=(const double& val) {
+  bool operator!=(const double& val) const {
     return (re != val) || (im != 0) || (dummy != 0);
   }
 
@@ -244,8 +244,9 @@ struct functor_teamvector_for {
     shared_int values         = shared_int(team.team_shmem(), shmemSize);
 
     if (values.data() == nullptr || values.extent(0) < shmemSize) {
-      printf("FAILED to allocate shared memory of size %u\n",
-             static_cast<unsigned int>(shmemSize));
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "FAILED to allocate shared memory of size %u\n",
+          static_cast<unsigned int>(shmemSize));
     } else {
       // Initialize shared memory.
       Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 131),
@@ -278,9 +279,10 @@ struct functor_teamvector_for {
         }
 
         if (test != value) {
-          printf("FAILED teamvector_parallel_for %i %i %f %f\n",
-                 team.league_rank(), team.team_rank(),
-                 static_cast<double>(test), static_cast<double>(value));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+              "FAILED teamvector_parallel_for %i %i %f %f\n",
+              team.league_rank(), team.team_rank(), static_cast<double>(test),
+              static_cast<double>(value));
           flag() = 1;
         }
       });
@@ -344,17 +346,18 @@ struct functor_teamvector_reduce {
 
       if (test != value) {
         if (team.league_rank() == 0) {
-          printf("FAILED teamvector_parallel_reduce %i %i %lf %lf %lu\n",
-                 (int)team.league_rank(), (int)team.team_rank(),
-                 static_cast<double>(test), static_cast<double>(value),
-                 static_cast<unsigned long>(sizeof(Scalar)));
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+              "FAILED teamvector_parallel_reduce %i %i %lf %lf %lu\n",
+              (int)team.league_rank(), (int)team.team_rank(),
+              static_cast<double>(test), static_cast<double>(value),
+              static_cast<unsigned long>(sizeof(Scalar)));
         }
 
         flag() = 1;
       }
       if (test != shared_value(0)) {
         if (team.league_rank() == 0) {
-          printf(
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF(
               "FAILED teamvector_parallel_reduce with shared result %i %i %lf "
               "%lf %lu\n",
               static_cast<int>(team.league_rank()),
@@ -416,14 +419,15 @@ struct functor_teamvector_reduce_reducer {
       }
 
       if (test != value) {
-        printf("FAILED teamvector_parallel_reduce_reducer %i %i %lf %lf\n",
-               team.league_rank(), team.team_rank(), static_cast<double>(test),
-               static_cast<double>(value));
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "FAILED teamvector_parallel_reduce_reducer %i %i %lf %lf\n",
+            team.league_rank(), team.team_rank(), static_cast<double>(test),
+            static_cast<double>(value));
 
         flag() = 1;
       }
       if (test != shared_value(0)) {
-        printf(
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
             "FAILED teamvector_parallel_reduce_reducer shared value %i %i %lf "
             "%lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
@@ -443,15 +447,35 @@ bool test_scalar(int nteams, int team_size, int test) {
   h_flag() = 0;
   Kokkos::deep_copy(d_flag, h_flag);
 
+  Kokkos::TeamPolicy<ExecutionSpace> policy(nteams, team_size, 8);
+
+  // FIXME_OPENMPTARGET - Need to allocate scratch space via set_scratch_space
+  // for the OPENMPTARGET backend.
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  using scratch_t = Kokkos::View<Scalar*, ExecutionSpace,
+                                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  int scratch_size = 0;
+  if (test == 0) {
+    scratch_size = scratch_t::shmem_size(131);
+  } else {
+    // FIXME_OPENMPTARGET - Currently allocating more than one team for nested
+    // reduction leads to runtime errors of illegal memory access, caused mostly
+    // due to the OpenMP memory allocation constraints.
+    policy       = Kokkos::TeamPolicy<ExecutionSpace>(1, team_size, 8);
+    scratch_size = scratch_t::shmem_size(1);
+  }
+
+  policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+#endif
+
   if (test == 0) {
     Kokkos::parallel_for(
-        "Test::TeamVectorFor",
-        Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
+        "Test::TeamVectorFor", policy,
         functor_teamvector_for<Scalar, ExecutionSpace>(d_flag));
   } else if (test == 1) {
     Kokkos::parallel_for(
-        "Test::TeamVectorReduce",
-        Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
+        "Test::TeamVectorReduce", policy,
         functor_teamvector_reduce<Scalar, ExecutionSpace>(d_flag));
   } else if (test == 2) {
     Kokkos::parallel_for(
@@ -477,8 +501,12 @@ bool Test(int test) {
            test_scalar<long long int, ExecutionSpace>(317, team_size, test);
   passed = passed && test_scalar<float, ExecutionSpace>(317, team_size, test);
   passed = passed && test_scalar<double, ExecutionSpace>(317, team_size, test);
+  // FIXME_OPENMPTARGET - Use of custom reducers currently results in runtime
+  // memory errors.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
   passed =
       passed && test_scalar<my_complex, ExecutionSpace>(317, team_size, test);
+#endif
 
   return passed;
 }
@@ -490,6 +518,10 @@ namespace Test {
 TEST(TEST_CATEGORY, team_teamvector_range) {
   ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(0)));
   ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(1)));
+  // FIXME_OPENMPTARGET - Use of kokkos reducers currently results in runtime
+  // memory errors.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
   ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(2)));
+#endif
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestUniqueToken.hpp b/packages/kokkos/core/unit_test/TestUniqueToken.hpp
index d78c35c6815d07bbbe37d83b0798955e07f9ce4a..4ba48bf73f069c6097a079ce1bcde5fd9452155c 100644
--- a/packages/kokkos/core/unit_test/TestUniqueToken.hpp
+++ b/packages/kokkos/core/unit_test/TestUniqueToken.hpp
@@ -132,6 +132,8 @@ class TestUniqueToken {
       }
     }
 
+    // FIXME_SYCL wrong result on NVIDIA GPUs but correct on host and Intel GPUs
+#ifndef KOKKOS_ENABLE_SYCL
     // Count test for pull request #3260
     {
       constexpr int N = 1000000;
@@ -148,6 +150,7 @@ class TestUniqueToken {
           self, sum);
       ASSERT_EQ(sum, int64_t(N) * R);
     }
+#endif
 
     std::cout << "TestUniqueToken max reuse = " << max << std::endl;
 
@@ -233,7 +236,13 @@ class TestAcquireTeamUniqueToken {
 
     {
       const int duplicate = 100;
-      const long n        = duplicate * self.tokens.size();
+      // FIXME_SYCL The number of workgroups on CUDA devices can not be larger
+      // than 65535
+#ifdef KOKKOS_ENABLE_SYCL
+      const long n = std::min(65535, duplicate * self.tokens.size());
+#else
+      const long n = duplicate * self.tokens.size();
+#endif
 
       team_policy_type team_policy(n, team_size);
       team_policy.set_scratch_size(
@@ -271,7 +280,10 @@ class TestAcquireTeamUniqueToken {
 };
 
 TEST(TEST_CATEGORY, acquire_team_unique_token) {
+  // FIXME_OPENMPTARGET - Not yet implemented.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
   TestAcquireTeamUniqueToken<TEST_EXECSPACE>::run();
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestUtilities.hpp b/packages/kokkos/core/unit_test/TestUtilities.hpp
index c9352c0d7fa25a5dda7fc8910609b7158e251f67..1d3e19da105161e0b71c733ad2bb1232add1d8aa 100644
--- a/packages/kokkos/core/unit_test/TestUtilities.hpp
+++ b/packages/kokkos/core/unit_test/TestUtilities.hpp
@@ -70,307 +70,6 @@ void test_is_specialization_of() {
                 "");
 }
 
-inline void test_utilities() {
-  using namespace Kokkos::Impl;
-
-  {
-    using i = integer_sequence<int>;
-    using j = make_integer_sequence<int, 0>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 0u, "Error: integer_sequence.size()");
-  }
-
-  {
-    using i = integer_sequence<int, 0>;
-    using j = make_integer_sequence<int, 1>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 1u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1>;
-    using j = make_integer_sequence<int, 2>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 2u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2>;
-    using j = make_integer_sequence<int, 3>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 3u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2, 3>;
-    using j = make_integer_sequence<int, 4>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 4u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<3, i>::value == 3,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2, 3, 4>;
-    using j = make_integer_sequence<int, 5>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 5u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<3, i>::value == 3,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<4, i>::value == 4,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2, 3, 4, 5>;
-    using j = make_integer_sequence<int, 6>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 6u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<3, i>::value == 3,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<4, i>::value == 4,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<5, i>::value == 5,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2, 3, 4, 5, 6>;
-    using j = make_integer_sequence<int, 7>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 7u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<3, i>::value == 3,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<4, i>::value == 4,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<5, i>::value == 5,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<6, i>::value == 6,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2, 3, 4, 5, 6, 7>;
-    using j = make_integer_sequence<int, 8>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 8u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<3, i>::value == 3,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<4, i>::value == 4,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<5, i>::value == 5,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<6, i>::value == 6,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<7, i>::value == 7,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2, 3, 4, 5, 6, 7, 8>;
-    using j = make_integer_sequence<int, 9>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 9u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<3, i>::value == 3,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<4, i>::value == 4,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<5, i>::value == 5,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<6, i>::value == 6,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<7, i>::value == 7,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<8, i>::value == 8,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(8, i{}) == 8, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i = integer_sequence<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9>;
-    using j = make_integer_sequence<int, 10>;
-
-    static_assert(std::is_same<i, j>::value, "Error: make_integer_sequence");
-    static_assert(i::size() == 10u, "Error: integer_sequence.size()");
-
-    static_assert(integer_sequence_at<0, i>::value == 0,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<1, i>::value == 1,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<2, i>::value == 2,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<3, i>::value == 3,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<4, i>::value == 4,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<5, i>::value == 5,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<6, i>::value == 6,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<7, i>::value == 7,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<8, i>::value == 8,
-                  "Error: integer_sequence_at");
-    static_assert(integer_sequence_at<9, i>::value == 9,
-                  "Error: integer_sequence_at");
-
-    static_assert(at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(8, i{}) == 8, "Error: at(unsigned, integer_sequence)");
-    static_assert(at(9, i{}) == 9, "Error: at(unsigned, integer_sequence)");
-  }
-
-  {
-    using i  = make_integer_sequence<int, 5>;
-    using r  = reverse_integer_sequence<i>;
-    using gr = integer_sequence<int, 4, 3, 2, 1, 0>;
-
-    static_assert(std::is_same<r, gr>::value,
-                  "Error: reverse_integer_sequence");
-  }
-
-  {
-    using s = make_integer_sequence<int, 10>;
-    using e = exclusive_scan_integer_sequence<s>;
-    using i = inclusive_scan_integer_sequence<s>;
-
-    using ge = integer_sequence<int, 0, 0, 1, 3, 6, 10, 15, 21, 28, 36>;
-    using gi = integer_sequence<int, 0, 1, 3, 6, 10, 15, 21, 28, 36, 45>;
-
-    static_assert(e::value == 45, "Error: scan value");
-    static_assert(i::value == 45, "Error: scan value");
-
-    static_assert(std::is_same<e::type, ge>::value, "Error: exclusive_scan");
-    static_assert(std::is_same<i::type, gi>::value, "Error: inclusive_scan");
-  }
-}
-
 template <std::size_t... Idxs, class... Args>
 std::size_t do_comma_emulation_test(std::integer_sequence<std::size_t, Idxs...>,
                                     Args... args) {
diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp
index e85942a8de54a8068937c6ba9280a8c045e2902e..570281f9fd66a230e69b9bb924a84a0078e12168 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1145,8 +1145,6 @@ class TestViewAPI {
     // T v2 = hx( 0, 0 ); // Generates compile error as intended.
     // hx( 0, 0 ) = v2;   // Generates compile error as intended.
 
-    // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
     // Testing with asynchronous deep copy with respect to device
     {
       size_t count = 0;
@@ -1249,7 +1247,6 @@ class TestViewAPI {
               ASSERT_EQ(hx(ip, i1, i2, i3), T(0));
             }
     }
-#endif
 
     dz = dx;
     ASSERT_EQ(dx, dz);
@@ -1481,12 +1478,6 @@ class TestViewAPI {
     if (std::is_same<typename dView1::memory_space,
                      Kokkos::Experimental::OpenMPTargetSpace>::value)
       return;
-#endif
-// FIXME_SYCL
-#ifdef KOKKOS_ENABLE_SYCL
-    if (std::is_same<typename dView1::memory_space,
-                     Kokkos::Experimental::SYCLDeviceUSMSpace>::value)
-      return;
 #endif
     auto alloc_size = std::numeric_limits<size_t>::max() - 42;
     try {
@@ -1504,10 +1495,21 @@ class TestViewAPI {
       // quickly.
       if (msg.find("is not a valid size") != std::string::npos) {
         ASSERT_PRED_FORMAT2(::testing::IsSubstring, "is not a valid size", msg);
-      } else {
-        // Otherwise, there has to be some sort of "insufficient memory" error
+      } else
+#ifdef KOKKOS_ENABLE_SYCL
+          if (msg.find("insufficient memory") != std::string::npos)
+#endif
+      {
         ASSERT_PRED_FORMAT2(::testing::IsSubstring, "insufficient memory", msg);
       }
+      // SYCL cannot tell the reason why a memory allocation failed
+#ifdef KOKKOS_ENABLE_SYCL
+      else {
+        // Otherwise, there has to be some sort of "unknown error" error
+        ASSERT_PRED_FORMAT2(::testing::IsSubstring,
+                            "because of an unknown error.", msg);
+      }
+#endif
     }
   }
 };
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_c.hpp b/packages/kokkos/core/unit_test/TestViewAPI_c.hpp
index 0cfe1b8c0eb5190fbcbae44777dd398319beb900..a70792dc623b63bb8aa1a84fec93ca413ffa94a1 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_c.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_c.hpp
@@ -47,10 +47,7 @@
 namespace Test {
 
 TEST(TEST_CATEGORY, view_api_c) {
-  // FIXME_SYCL requires deep_copy on the default memory space
-#ifndef KOKKOS_ENABLE_SYCL
   TestViewAPI<double, TEST_EXECSPACE>::run_test_deep_copy_empty();
-#endif
   TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_b();
 }
 
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
index cb586c76a700e1acad69c185e13f496c61a9f6c2..a5dc6cf29a467bd576bd96bca52f90b3db26324b 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
@@ -98,8 +98,6 @@ TEST(TEST_CATEGORY, view_remap) {
 
   Kokkos::fence();
   // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
-  // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
   Kokkos::deep_copy(output, input);
   Kokkos::fence();
 
@@ -112,7 +110,6 @@ TEST(TEST_CATEGORY, view_remap) {
           ++value;
           ASSERT_EQ(value, ((int)output(i0, i1, i2, i3)));
         }
-#endif
 }
 
 TEST(TEST_CATEGORY, view_mirror_nonconst) {
diff --git a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
index f0b5b8ff9f9ce750fe2476037c67de1a3541578f..e25cb9e39ca6fd4c3cd45ef2b60b404ed82c03e7 100644
--- a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
@@ -104,8 +104,6 @@ TEST(TEST_CATEGORY, view_copy_tests) {
       typename TEST_EXECSPACE::memory_space>::accessible;
 
   // Contiguous copies
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   { Kokkos::deep_copy(defaulted, defaulted); }
   {
     Kokkos::deep_copy(a, 1);
@@ -151,7 +149,6 @@ TEST(TEST_CATEGORY, view_copy_tests) {
     Kokkos::deep_copy(b, h_b);
     ASSERT_TRUE(run_check(b, 4));
   }
-#endif
   // Non contiguous copies
   {
     Kokkos::deep_copy(s_a, 5);
@@ -180,8 +177,6 @@ TEST(TEST_CATEGORY, view_copy_tests) {
     }
   }
 
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   // Contiguous copies
   { Kokkos::deep_copy(dev, defaulted, defaulted); }
   {
@@ -228,9 +223,6 @@ TEST(TEST_CATEGORY, view_copy_tests) {
     Kokkos::deep_copy(dev, b, h_b);
     ASSERT_TRUE(run_check(b, 4));
   }
-#endif
-
-  // WORKS if commenting out below stuff
   // Non contiguous copies
   {
     Kokkos::deep_copy(dev, s_a, 5);
@@ -259,8 +251,6 @@ TEST(TEST_CATEGORY, view_copy_tests) {
     }
   }
 
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   // Contiguous copies
   { Kokkos::deep_copy(host, defaulted, defaulted); }
   {
@@ -307,7 +297,6 @@ TEST(TEST_CATEGORY, view_copy_tests) {
     Kokkos::deep_copy(host, b, h_b);
     ASSERT_TRUE(run_check(b, 4));
   }
-#endif
   // Non contiguous copies
   {
     Kokkos::deep_copy(host, s_a, 5);
diff --git a/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp b/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
index 31108af38e0c8ceed163cee5db77220fdbe855f5..9ce3a34236956572b5a63c38765c05564a536140 100644
--- a/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
+++ b/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
@@ -95,8 +95,6 @@ TEST(TEST_CATEGORY, view_layoutstride_left_to_layoutleft_assignment) {
     ASSERT_EQ(dst.span(), src.span());
     ASSERT_EQ(test, true);
   }
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   {  // Assignment of rank-2 LayoutLeft = LayoutStride
     int ndims   = 2;
     int dims[]  = {10, 9};
@@ -335,7 +333,6 @@ TEST(TEST_CATEGORY, view_layoutstride_left_to_layoutleft_assignment) {
     ASSERT_EQ(dst.span(), src.span());
     ASSERT_EQ(test, true);
   }
-#endif
 }
 
 TEST(TEST_CATEGORY, view_layoutstride_right_to_layoutright_assignment) {
@@ -380,8 +377,6 @@ TEST(TEST_CATEGORY, view_layoutstride_right_to_layoutright_assignment) {
     ASSERT_EQ(dst.span(), src.span());
     ASSERT_EQ(test, true);
   }
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   {  // Assignment of rank-2 LayoutRight = LayoutStride
     int ndims   = 2;
     int dims[]  = {10, 9};
@@ -620,7 +615,6 @@ TEST(TEST_CATEGORY, view_layoutstride_right_to_layoutright_assignment) {
     ASSERT_EQ(dst.span(), src.span());
     ASSERT_EQ(test, true);
   }
-#endif
 }
 
 TEST(TEST_CATEGORY_DEATH, view_layoutstride_right_to_layoutleft_assignment) {
@@ -667,9 +661,8 @@ TEST(TEST_CATEGORY_DEATH, view_layoutstride_right_to_layoutleft_assignment) {
     ASSERT_EQ(dst.span(), src.span());
     ASSERT_EQ(test, true);
   }
-// FIXME_SYCL deadlocks
 // WORKAROUND OPENMPTARGET : death tests don't seem to work ...
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_SYCL)
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
   return;
 #endif
   {  // Assignment of rank-2 LayoutLeft = LayoutStride (LayoutRight compatible)
@@ -823,9 +816,8 @@ TEST(TEST_CATEGORY_DEATH, view_layoutstride_left_to_layoutright_assignment) {
     ASSERT_EQ(dst.span(), src.span());
     ASSERT_EQ(test, true);
   }
-// FIXME_SYCL deadlocks
 // WORKAROUND OPENMPTARGET : death tests don't seem to work ...
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_SYCL)
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
   return;
 #endif
   {  // Assignment of rank-2 LayoutRight = LayoutStride (LayoutLeft compatible)
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
index 15d2976499d4533ce632bfefb654895fc0c7c9eb..fdbda099176c79410c1be6599546f09aba3269dc 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
@@ -905,8 +905,6 @@ void test_view_mapping() {
         Kokkos::Impl::ViewCtorProp<int*>(nullptr), stride);
   }
 
-  // FIXME_SYCL requires MDRangePolicy
-#ifndef KOKKOS_ENABLE_SYCL
   {
     using V           = Kokkos::View<int**, Space>;
     using M           = typename V::HostMirror;
@@ -1033,7 +1031,6 @@ void test_view_mapping() {
     ASSERT_EQ(d.extent(0), 7);
     ASSERT_EQ(d.extent(1), 8);
   }
-#endif
 
   {
     using V = Kokkos::View<int*, Space>;
diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp
index b28f09934d598881e35748ef86a60771cfb42e81..0125017d93786101e2a23a866effe9d8a5e5242d 100644
--- a/packages/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp
@@ -2036,8 +2036,6 @@ template <class Space, class MemTraits = void>
 void test_layoutleft_to_layoutleft() {
   Impl::test_subview_legal_args_left();
 
-  // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
   using view3D_t = Kokkos::View<int***, Kokkos::LayoutLeft, Space>;
   using view4D_t = Kokkos::View<int****, Kokkos::LayoutLeft, Space>;
   {
@@ -2075,15 +2073,12 @@ void test_layoutleft_to_layoutleft() {
                                                                   1);
     check.run();
   }
-#endif
 }
 
 template <class Space, class MemTraits = void>
 void test_layoutright_to_layoutright() {
   Impl::test_subview_legal_args_right();
 
-  // FIXME_SYCL requires MDRange policy
-#ifndef KOKKOS_ENABLE_SYCL
   using view3D_t = Kokkos::View<int***, Kokkos::LayoutRight, Space>;
   using view4D_t = Kokkos::View<int****, Kokkos::LayoutRight, Space>;
   {
@@ -2107,7 +2102,6 @@ void test_layoutright_to_layoutright() {
                                                                   0);
     check.run();
   }
-#endif
 }
 //----------------------------------------------------------------------------
 
@@ -2139,6 +2133,51 @@ void test_unmanaged_subview_reset() {
 
 //----------------------------------------------------------------------------
 
+template <std::underlying_type_t<Kokkos::MemoryTraitsFlags> MTF>
+struct TestSubviewMemoryTraitsConstruction {
+  void operator()() const noexcept {
+    using view_type          = Kokkos::View<double*, Kokkos::HostSpace>;
+    using size_type          = view_type::size_type;
+    using memory_traits_type = Kokkos::MemoryTraits<MTF>;
+
+    view_type v("v", 7);
+    for (size_type i = 0; i != v.size(); ++i) v[i] = static_cast<double>(i);
+
+    std::pair<int, int> range(3, 5);
+    auto sv = Kokkos::subview<memory_traits_type>(v, range);
+
+    ASSERT_EQ(2u, sv.size());
+    EXPECT_EQ(3., sv[0]);
+    EXPECT_EQ(4., sv[1]);
+  }
+};
+
+inline void test_subview_memory_traits_construction() {
+  // Test all combinations of MemoryTraits:
+  // Unmanaged (1)
+  // RandomAccess (2)
+  // Atomic (4)
+  // Restricted (8)
+  TestSubviewMemoryTraitsConstruction<0>()();
+  TestSubviewMemoryTraitsConstruction<1>()();
+  TestSubviewMemoryTraitsConstruction<2>()();
+  TestSubviewMemoryTraitsConstruction<3>()();
+  TestSubviewMemoryTraitsConstruction<4>()();
+  TestSubviewMemoryTraitsConstruction<5>()();
+  TestSubviewMemoryTraitsConstruction<6>()();
+  TestSubviewMemoryTraitsConstruction<7>()();
+  TestSubviewMemoryTraitsConstruction<8>()();
+  TestSubviewMemoryTraitsConstruction<9>()();
+  TestSubviewMemoryTraitsConstruction<10>()();
+  TestSubviewMemoryTraitsConstruction<11>()();
+  TestSubviewMemoryTraitsConstruction<12>()();
+  TestSubviewMemoryTraitsConstruction<13>()();
+  TestSubviewMemoryTraitsConstruction<14>()();
+  TestSubviewMemoryTraitsConstruction<15>()();
+}
+
+//----------------------------------------------------------------------------
+
 template <class T>
 struct get_view_type;
 
diff --git a/packages/kokkos/core/unit_test/TestView_64bit.hpp b/packages/kokkos/core/unit_test/TestView_64bit.hpp
index 7dc47ccb0f299133a30f042626c2f45151de19ee..50626718b5774ddefa03a453402564986e831ed1 100644
--- a/packages/kokkos/core/unit_test/TestView_64bit.hpp
+++ b/packages/kokkos/core/unit_test/TestView_64bit.hpp
@@ -49,7 +49,12 @@ namespace Test {
 template <class Device>
 void test_64bit() {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-  int64_t N   = 5000000000;
+  // FIXME_SYCL The SYCL CUDA backend throws an error
+#ifdef KOKKOS_ENABLE_SYCL
+  int64_t N = 1000000000;
+#else
+  int64_t N = 5000000000;
+#endif
   int64_t sum = 0;
   {
     Kokkos::parallel_reduce(
diff --git a/packages/kokkos/core/unit_test/Test_InterOp_Streams.hpp b/packages/kokkos/core/unit_test/Test_InterOp_Streams.hpp
index 4c16147a360c0f6797cd4d7c3b3b541a75470b03..6af731b9fa3e037598123add65071c1efa341187 100644
--- a/packages/kokkos/core/unit_test/Test_InterOp_Streams.hpp
+++ b/packages/kokkos/core/unit_test/Test_InterOp_Streams.hpp
@@ -46,12 +46,14 @@
 
 namespace Test {
 
+#ifndef KOKKOS_ENABLE_SYCL
 __global__ void offset_streams(int* p) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < 100) {
     p[idx] += idx;
   }
 }
+#endif
 
 template <typename MemorySpace>
 struct FunctorRange {
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp
diff --git a/packages/kokkos/core/unit_test/TestCudaUVM_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestCudaUVM_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestCudaUVM_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestCudaUVM_Category.hpp
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestCuda_Category.hpp
similarity index 98%
rename from packages/kokkos/core/unit_test/cuda/TestCuda_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestCuda_Category.hpp
index 6831200df9068a3f93ca54bdd7977d3a621d8b0c..22666dc82fab611ee08aa7555e9b56ae0b2f148a 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestCuda_Category.hpp
@@ -48,6 +48,7 @@
 #include <gtest/gtest.h>
 
 #define TEST_CATEGORY cuda
+#define TEST_CATEGORY_NUMBER 5
 #define TEST_CATEGORY_DEATH cuda_DeathTest
 #define TEST_EXECSPACE Kokkos::Cuda
 #define TEST_CATEGORY_FIXTURE(name) cuda_##name
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestDefaultDeviceType_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/default/TestDefaultDeviceType_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestDefaultDeviceType_Category.hpp
diff --git a/packages/kokkos/core/unit_test/TestHIPHostPinned_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestHIPHostPinned_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp
diff --git a/packages/kokkos/core/unit_test/TestHIP_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestHIP_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
diff --git a/packages/kokkos/core/unit_test/TestHPX_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestHPX_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp
diff --git a/packages/kokkos/core/unit_test/TestOpenMPTarget_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestOpenMPTarget_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp
diff --git a/packages/kokkos/core/unit_test/TestOpenMP_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestOpenMP_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestOpenMP_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestOpenMP_Category.hpp
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp
similarity index 91%
rename from packages/kokkos/core/unit_test/hip/TestHIP_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp
index 8cae165c3374330148023ac58d832b317692a752..1ec89fc61a594989f58b5076af6477be051183e8 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp
@@ -42,12 +42,12 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_TEST_HIP_HPP
-#define KOKKOS_TEST_HIP_HPP
+#ifndef KOKKOS_TEST_SYCL_SHARED_USM_SPACE_HPP
+#define KOKKOS_TEST_SYCL_SHARED_USM_SPACE_HPP
 
 #include <gtest/gtest.h>
 
-#define TEST_CATEGORY hip
-#define TEST_EXECSPACE Kokkos::Experimental::HIP
+#define TEST_CATEGORY sycl_shared_usm
+#define TEST_EXECSPACE Kokkos::Experimental::SYCLSharedUSMSpace
 
 #endif
diff --git a/packages/kokkos/core/unit_test/TestSYCL_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp
similarity index 98%
rename from packages/kokkos/core/unit_test/TestSYCL_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp
index cd4c0ed22aac43d594df917de084bac9869bf737..345f40d1c39f403dd62369c8cfa668ed1c75a951 100644
--- a/packages/kokkos/core/unit_test/TestSYCL_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp
@@ -48,6 +48,7 @@
 #include <gtest/gtest.h>
 
 #define TEST_CATEGORY sycl
+#define TEST_CATEGORY_NUMBER 7
 #define TEST_EXECSPACE Kokkos::Experimental::SYCL
 
 #endif
diff --git a/packages/kokkos/core/unit_test/TestSerial_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSerial_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestSerial_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestSerial_Category.hpp
diff --git a/packages/kokkos/core/unit_test/TestThreads_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/TestThreads_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
index cbc8894203b8d0bfde1783fd19b0cd153a7be33c..4228b5181a0ccd68dfde87f71f92fd0a471a8e96 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestSharedAlloc.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
index eeb9f3fa3a44ac2f992f95477098fb938cbaea1b..316a2b5d0fe0dba2c9b74f3f6f7a6d61342d2c4c 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewAPI_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
index 04949cf5739e9ebff78acf5ed025adb8eaea1679..5eed2ca0d77b828b2431bfce0fe69c4da457bb95 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewAPI_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
index bf259ef734130780ab84b874fec1ea927582559d..26dc9b0e000096ab1809412c4a29fc563844cbd1 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewAPI_c.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
index 84d81e3b41fc843522ef4f5a1969d9a4ae7b5131..bab29610a3d4ad2e812405ba96ed06c7e2dfb3b8 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
index db9e990ae5a6738cbcfb100670709ee7ef65254b..fd227186d5668239b9d9fe3f6a1ae2b3d5510b32 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
index 8f5fd4e3e89f257abfb2190934bdd46da1602646..669761df979cfd1458f1d5ea78acfb5738af0d38 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
index 8d288cf71538d8f9cefe9f81f11ccf329de22519..d367fd7e051f49495ce747f6f490bad795f94d86 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
index 923f4df965c597d87b3f92abfa26b69dcd9a7f8b..01b284b2f562299b4f23cc197693c2baad40f38e 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
index 1efe65b21db58392bad6c7ef36b3808c763d3f81..e15228b1d772a5dba97ee434e17fdb18188a709a 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
index 9e1034c5b72b80592d647aae17fd35dd77faf8ba..52bbd42f292f4b865def36856913dfc6bbe0028f 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestCudaHostPinned_Category.hpp>
 #include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
index 5024b30f90cb559d5278e7e217ace4d84da928ef..6602d7396a7c2fdec7e16e83079764962dbeab75 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestSharedAlloc.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
index 3a48b2495e716bfd4b92bbf9a76ce91be399b4ea..4aeac8f13f4d28672c671a51c1eacfedbf0e92fd 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewAPI_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
index 7f0effb5e8d8a1538f656798a956f6edf967d5bd..e5cb0103424fd022290998307f086aedaea0cb29 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewAPI_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
index 9d8cbf0c773f8c5ff293178cf7f9b3fc092a6fe1..a52fcb833ed2a0e959a25e36195460c1ed914a78 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewAPI_c.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
index 8d2b5268bf5394e76e0ea8db7e0bce9b04fbb383..e345cd9667526671ef898a0d1247343b47f6296c 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
index 5c11b2a322ca10a0251476babb82a7a1207253e4..61547df4f523969f8c93da8315fddb4467e5ade9 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
index 3344c71136195386679c151aab13fb53715172f4..75a769bb947485e6e7459c1cb95b7b3b1c26f9b1 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
index 1234a4b649474ea13ce756e5a49e7bd8dc47c722..7d09f5c9f397b3723599aec64c3c50a6aa77a769 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
index aa9b513d7aab8033dfbe8595f8ae3ebe8c9ad2dc..ea03f43bd69a318095e6277f4db226241fc9a482 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
index f24e9ba4f1f0432eace053ca72b5ecf577c2d231..1f754e8f4996cbc3c0fbefd7000bff65451b19f0 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
index 47b7d1f46cbab75ae8a6215fd72094472e7240c5..4af7057d2aa47db99a8325159e0ee737feff7767 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestCudaUVM_Category.hpp>
 #include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_DebugPinUVMSpace.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugPinUVMSpace.cpp
index 4313cece0ca18ef120966230666eb16c847dc986..5b6fccdbd0a500cbb0d45574879a797c866d1b55 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_DebugPinUVMSpace.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugPinUVMSpace.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_DebugSerialExecution.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugSerialExecution.cpp
index 5472ef9fd169a5a43347e98519c8ab32e04ca5eb..f1d3dfc5245d971b6b90ca3ef11731e34b538f67 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_DebugSerialExecution.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugSerialExecution.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp
index 38f4336a86c499723c21a985f7a8445a81df74d1..77b1e58a1586482b029f89298c7273cfccc95a7d 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
 #include <TestGraph.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
index 6073b9cd7eff97cf67122086031ff95c00304a8c..ee7181e1180fdb887a87190605565e42e897409c 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
@@ -43,7 +43,9 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
+
+#include <array>
 
 namespace Test {
 
@@ -58,7 +60,7 @@ __global__ void offset(int* p) {
 // Cuda.
 TEST(cuda, raw_cuda_interop) {
   int* p;
-  cudaMalloc(&p, sizeof(int) * 100);
+  CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
 
@@ -70,8 +72,8 @@ TEST(cuda, raw_cuda_interop) {
   offset<<<100, 64>>>(p);
   CUDA_SAFE_CALL(cudaDeviceSynchronize());
 
-  int* h_p = new int[100];
-  cudaMemcpy(h_p, p, sizeof(int) * 100, cudaMemcpyDefault);
+  std::array<int, 100> h_p;
+  cudaMemcpy(h_p.data(), p, sizeof(int) * 100, cudaMemcpyDefault);
   CUDA_SAFE_CALL(cudaDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
@@ -81,5 +83,6 @@ TEST(cuda, raw_cuda_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
+  CUDA_SAFE_CALL(cudaFree(p));
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
index 57c0e454d32ee89f03484ba95f46f7bcc1b5d190..526b985c00f2eec2eab6cafb8e862eff5024d575 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
 #include <Test_InterOp_Streams.hpp>
 
 namespace Test {
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
index d68ffb0865507c356247a8b3a5fcad6f9090b019..646b37908654d2af6327158cb49f7d4257e8f8bf 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp
index e2591c9b127518b21e96be95a02297fa5696294a..42fa615bc6f65f0661ceaad12c3613781a133a52 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
 #include <TestTaskScheduler.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp
index 93225377d341721ba6d700cc741a4bb280822af0..eb9077aaf423b2bf9bdfa919d4d45cd18805d069 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
 #include <Kokkos_Core.hpp>
 
 namespace Test {
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceDevelop.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceDevelop.cpp
index a80aded124fac55b6324697e7d043cb5b085c35e..b312f42b24369a725a44bdd1de1a2771e794959f 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceDevelop.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceDevelop.cpp
@@ -47,7 +47,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
index c4320a137e5fc7651aab44cc4fe73d8e171f8561..5dcbe566e299c0f013843216b0854dc51582dd6d 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
@@ -45,7 +45,7 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestHalfConversion.hpp>
 #include <TestHalfOperators.hpp>
 
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp
index bc048596a07fed0e0c8c9fb9e8a30fd231228c19..7f53034557dca1c06bcbc6588ff7fdce6ddbb4c4 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp
@@ -45,8 +45,6 @@
 #include <gtest/gtest.h>
 #include "TestResize.hpp"
 
-// FIXME_SYCL requires parallel_for
-#ifndef KOKKOS_ENABLE_SYCL
 namespace Test {
 
 TEST(kokkosresize, host_space_access) {
@@ -57,4 +55,3 @@ TEST(kokkosresize, host_space_access) {
 }
 
 }  // namespace Test
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a1.cpp
index 248b074fd502a18be61e606180ba3df42388c130..9b57de712183a7ff1fd72533f578c25947901f39 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a1.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a1.cpp
@@ -48,16 +48,9 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
-// FIXME_SYCL
-// C++ exception with description "Global_work_size not evenly divisible by
-// local_work_size. Non-uniform work-groups are not allowed by default.
-// Underlying OpenCL 2.x implementation supports this feature and to enable it,
-// build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)"
-// thrown in the test body.
-#ifndef KOKKOS_ENABLE_SYCL
 namespace Test {
 
 TEST(defaultdevicetype, reduce_instantiation_a1) {
@@ -66,4 +59,3 @@ TEST(defaultdevicetype, reduce_instantiation_a1) {
 
 }  // namespace Test
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a2.cpp
index 663371a765a1b8d9f9cbc29a462c5bbb9e6038dc..314891433693df21689fedb2827dc8d614896383 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a2.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a2.cpp
@@ -48,16 +48,9 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
-// FIXME_SYCL
-// C++ exception with description "Global_work_size not evenly divisible by
-// local_work_size. Non-uniform work-groups are not allowed by default.
-// Underlying OpenCL 2.x implementation supports this feature and to enable it,
-// build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)"
-// thrown in the test body.
-#ifndef KOKKOS_ENABLE_SYCL
 namespace Test {
 
 TEST(defaultdevicetype, reduce_instantiation_a2) {
@@ -67,4 +60,3 @@ TEST(defaultdevicetype, reduce_instantiation_a2) {
 }  // namespace Test
 
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a3.cpp
index 948d2586bd6043b7ae17bb51c6ef1467e3f55d2c..f9e36e298a227281683ceae0bf5bfe9179a6b8d5 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a3.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a3.cpp
@@ -46,11 +46,9 @@
 
 #include <Kokkos_Core.hpp>
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
 namespace Test {
@@ -62,4 +60,3 @@ TEST(defaultdevicetype, reduce_instantiation_a3) {
 }  // namespace Test
 
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b1.cpp
index 449c8cd5622eb1fb55f1bf230c9dbdedbd028561..1a34bef874f75ecf4a75a99166f7abc66ec1804c 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b1.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b1.cpp
@@ -48,16 +48,9 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
-// FIXME_SYCL
-// C++ exception with description "Global_work_size not evenly divisible by
-// local_work_size. Non-uniform work-groups are not allowed by default.
-// Underlying OpenCL 2.x implementation supports this feature and to enable it,
-// build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)"
-// thrown in the test body.
-#ifndef KOKKOS_ENABLE_SYCL
 namespace Test {
 
 TEST(defaultdevicetype, reduce_instantiation_b1) {
@@ -66,4 +59,3 @@ TEST(defaultdevicetype, reduce_instantiation_b1) {
 
 }  // namespace Test
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b2.cpp
index 551bd6c20739bfd6bcd536506fab2bd8f214dc5d..8bd7628243ab0e3c073d797cc2817ec2a4ba1185 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b2.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b2.cpp
@@ -48,16 +48,9 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
-// FIXME_SYCL
-// C++ exception with description "Global_work_size not evenly divisible by
-// local_work_size. Non-uniform work-groups are not allowed by default.
-// Underlying OpenCL 2.x implementation supports this feature and to enable it,
-// build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)"
-// thrown in the test body.
-#ifndef KOKKOS_ENABLE_SYCL
 namespace Test {
 
 TEST(defaultdevicetype, reduce_instantiation_b2) {
@@ -67,4 +60,3 @@ TEST(defaultdevicetype, reduce_instantiation_b2) {
 }  // namespace Test
 
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b3.cpp
index e1c5b16fee2e28a32e3404eab029fd4a46ff7ed9..bc1d763437d1f4d23ae688fe94c16fcd7f9367f9 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b3.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b3.cpp
@@ -46,11 +46,9 @@
 
 #include <Kokkos_Core.hpp>
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
 namespace Test {
@@ -62,4 +60,3 @@ TEST(defaultdevicetype, reduce_instantiation_b3) {
 }  // namespace Test
 
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c1.cpp
index 6e7587781f023d282bfdef4d769ecd7867d3beb8..ba4cca46fbb9d0883691a40ee53b8a11c739b4c5 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c1.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c1.cpp
@@ -48,16 +48,9 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
-// FIXME_SYCL
-// C++ exception with description "Global_work_size not evenly divisible by
-// local_work_size. Non-uniform work-groups are not allowed by default.
-// Underlying OpenCL 2.x implementation supports this feature and to enable it,
-// build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)"
-// thrown in the test body.
-#ifndef KOKKOS_ENABLE_SYCL
 namespace Test {
 
 TEST(defaultdevicetype, reduce_instantiation_c1) {
@@ -66,4 +59,3 @@ TEST(defaultdevicetype, reduce_instantiation_c1) {
 
 }  // namespace Test
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c2.cpp
index ae4f1082a2ab27eebe04afd405843dc770eec152..0459f98dddb20e7cae811502e4ebf5518b011c6b 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c2.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c2.cpp
@@ -48,16 +48,9 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
-// FIXME_SYCL
-// C++ exception with description "Global_work_size not evenly divisible by
-// local_work_size. Non-uniform work-groups are not allowed by default.
-// Underlying OpenCL 2.x implementation supports this feature and to enable it,
-// build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)"
-// thrown in the test body.
-#ifndef KOKKOS_ENABLE_SYCL
 namespace Test {
 
 TEST(defaultdevicetype, reduce_instantiation_c2) {
@@ -67,4 +60,3 @@ TEST(defaultdevicetype, reduce_instantiation_c2) {
 }  // namespace Test
 
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c3.cpp
index 92a956bee7f39046851d4752f580fe56295d50df..801dee83bbe16b6b25398b27068e5d8a3b3d29e2 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c3.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c3.cpp
@@ -46,11 +46,9 @@
 
 #include <Kokkos_Core.hpp>
 
-// FIXME_SYCL requires TeamPolicy
-#ifndef KOKKOS_ENABLE_SYCL
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestReduceCombinatorical.hpp>
 
 namespace Test {
@@ -62,4 +60,3 @@ TEST(defaultdevicetype, reduce_instantiation_c3) {
 }  // namespace Test
 
 #endif
-#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
index ff87b7802cfdababbd3ce6994f8f794d44bb4d24..bcd49e69bd3af022ede0ca0a188066288c9b1d35 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
@@ -48,13 +48,11 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestDefaultDeviceType_Category.hpp>
 #include <TestUtilities.hpp>
 
 namespace Test {
 
-TEST(defaultdevicetype, test_utilities) { test_utilities(); }
-
 TEST(defaultdevicetype, malloc) {
   int* data = (int*)Kokkos::kokkos_malloc(100 * sizeof(int));
   ASSERT_NO_THROW(data = (int*)Kokkos::kokkos_realloc(data, 120 * sizeof(int)));
diff --git a/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt
index 485dd4d112271bc1897ab96f31c291dde1f4f96c..20b295650a610a601d73e88b2b116e5dda34c324 100644
--- a/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt
@@ -1,25 +1,19 @@
 # Create tests that contain each header separately. We do not  run these tests
 # but we just try to compile them.
-if(NOT KOKKOS_HAS_TRILINOS)
-  # Globbing all the header filenames to test for self-containment and presence of header guards
-   KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space")
-   KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space")
-   KOKKOS_OPTION(DEFAULT_DEVICE_EXECUTION_SPACE "" STRING "Override default device execution space")
-   KOKKOS_OPTION(DEFAULT_HOST_PARALLEL_EXECUTION_SPACE "" STRING "Override default host parallel execution space")
-   # Globbing all the header filenames to test for self-containment and presence of header guards
-   SET(BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../")
-   file(GLOB KOKKOS_CORE_HEADERS RELATIVE ${BASE_DIR}/core/src
-        ${BASE_DIR}/core/src/*.hpp ${BASE_DIR}/core/src/*.h)
-   file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src
-        ${BASE_DIR}/containers/src/*.hpp)
-   file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE  ${BASE_DIR}/algorithms/src
-        ${BASE_DIR}/algorithms/src/*.hpp)
 
-   foreach (_header ${KOKKOS_CORE_HEADERS} ${KOKKOS_CONTAINERS_HEADERS} ${KOKKOS_ALGORITHMS_HEADERS})
-      string(REGEX REPLACE "[\./]" "_" header_test_name ${_header})
-      set(header_test_name Kokkos_HeaderSelfContained_${header_test_name})
-      add_executable(${header_test_name} tstHeader.cpp)
-      target_link_libraries(${header_test_name} PRIVATE Kokkos::kokkos)
-      target_compile_definitions(${header_test_name} PRIVATE KOKKOS_HEADER_TEST_NAME=${_header})
-   endforeach()
-endif()
+# Globbing all the header filenames to test for self-containment and presence of header guards
+SET(BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../")
+file(GLOB KOKKOS_CORE_HEADERS RELATIVE ${BASE_DIR}/core/src
+     ${BASE_DIR}/core/src/*.hpp ${BASE_DIR}/core/src/*.h)
+file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src
+     ${BASE_DIR}/containers/src/*.hpp)
+file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE  ${BASE_DIR}/algorithms/src
+     ${BASE_DIR}/algorithms/src/*.hpp)
+
+foreach (_header ${KOKKOS_CORE_HEADERS} ${KOKKOS_CONTAINERS_HEADERS} ${KOKKOS_ALGORITHMS_HEADERS})
+  string(REGEX REPLACE "[\./]" "_" header_test_name ${_header})
+  set(header_test_name Kokkos_HeaderSelfContained_${header_test_name})
+  add_executable(${header_test_name} tstHeader.cpp)
+  target_link_libraries(${header_test_name} PRIVATE Kokkos::kokkos)
+  target_compile_definitions(${header_test_name} PRIVATE KOKKOS_HEADER_TEST_NAME=${_header})
+endforeach()
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_Category.hpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_Category.hpp
deleted file mode 100644
index 12c69926c7bfc10ec7fef02d9e96c39691c557d6..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_Category.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_HIPHOSTPINNED_HPP
-#define KOKKOS_TEST_HIPHOSTPINNED_HPP
-
-#include <gtest/gtest.h>
-
-#define TEST_CATEGORY hip_hostpinned
-#define TEST_EXECSPACE Kokkos::Experimental::HIPHostPinnedSpace
-
-#endif
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
index 53f0371b23dfd0cd32044ba70a4b106f2f7b9a45..02157836b3f6075c6c18e2919d93ed4b541dbab8 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewAPI_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
index 2e3685d6102d75b8516846f80bc4c3fb959a93b7..80e2fe3f93716c23979ede23aa81de9b2f694c9e 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewAPI_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
index 079a244d43ee4e2570dc85cdb6bc6d8957769d55..9694e33ca0ce0f5c2fc6214613f4ae2f03c9750d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewAPI_c.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
index bc3843b2c1dcc0eff3282c40426af11c1a7e8098..0d773494ac6236ce0274cc844fb3369aec81d51d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
index 1c80e05fe0f9f5671f6c394f2ab37ba9fece0d48..cbbbc810b0e8e588be2892b83279a4137675de66 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
index b630ad7016a194e5f9d290e0a5e04e66da571b21..444a3e6e95d2a62c1ad0e8bedba3767503dd4687 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
index 1a88e7ebc6dec2bf9d23fee4b210eaea8a9d6869..f1f90e7acf13c7aaa4820f5bd50ecc403f2d6f5f 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
index f9b5608d1b2b7a7a413d24b31e5f2f09e832fc54..5e83121e341db1da440c65cd5dce84dc1a6f6259 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
index bff68fc07253e3602412a133abc61ee5f4a4e062..c024143d6c7b735dfa3b897e0a4503ee50e4caec 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
index cfa9da549593e817b3eb79cd33fd75ce87953e73..dcd6c1dc435982fdf44950c3b606847c29c30b37 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <hip/TestHIPHostPinned_Category.hpp>
+#include <TestHIPHostPinned_Category.hpp>
 #include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/algorithms/unit_tests/TestThreads.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
similarity index 61%
rename from packages/kokkos/algorithms/unit_tests/TestThreads.cpp
rename to packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
index c75e6e8dfba9f8d69617b8ff44b4c095a9e55537..0a243e0e8e89c0ef5a7cec6195837909d092bc2a 100644
--- a/packages/kokkos/algorithms/unit_tests/TestThreads.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
@@ -42,47 +42,48 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_THREADS
-
-#include <gtest/gtest.h>
-
 #include <Kokkos_Core.hpp>
-
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
-#include <iomanip>
-
-//----------------------------------------------------------------------------
+#include <TestHIP_Category.hpp>
 
 namespace Test {
 
-#define THREADS_RANDOM_XORSHIFT64(num_draws)                             \
-  TEST(threads, Random_XorShift64) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Threads> >( \
-        num_draws);                                                      \
-  }
+struct TestAsyncLauncher {
+  size_t *m_flag;
+  size_t m_value;
 
-#define THREADS_RANDOM_XORSHIFT1024(num_draws)                             \
-  TEST(threads, Random_XorShift1024) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Threads> >( \
-        num_draws);                                                        \
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int /*i*/) const {
+    // and update flag
+    Kokkos::atomic_add(m_flag, m_value);
   }
 
-#define THREADS_SORT_UNSIGNED(size)                 \
-  TEST(threads, SortUnsigned) {                     \
-    Impl::test_sort<Kokkos::Threads, double>(size); \
-  }
+  TestAsyncLauncher(size_t *flag, int value) : m_flag(flag), m_value(value) {}
 
-THREADS_RANDOM_XORSHIFT64(10240000)
-THREADS_RANDOM_XORSHIFT1024(10130144)
-THREADS_SORT_UNSIGNED(171)
+  void run() {
+    Kokkos::parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), *this);
+  }
+};
 
-#undef THREADS_RANDOM_XORSHIFT64
-#undef THREADS_RANDOM_XORSHIFT1024
-#undef THREADS_SORT_UNSIGNED
+TEST(hip, async_launcher) {
+  size_t *flag;
+  HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t)));
+  HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t)));
+  // launch # of cycles * 1000 kernels w/ distinct values
+  auto space        = Kokkos::Experimental::HIP();
+  auto instance     = space.impl_internal_space_instance();
+  size_t max_cycles = instance->m_maxDriverCycles;
+  size_t nkernels   = max_cycles * 1000;
+  for (size_t i = 0; i < nkernels; ++i) {
+    TestAsyncLauncher(flag, i).run();
+  }
+  // and check results -- if any of the driver types were overwritten
+  // the sum below should fail
+  instance->fence();
+  size_t h_flag;
+  HIP_SAFE_CALL(
+      hipMemcpy(&h_flag, flag, sizeof(size_t), hipMemcpyHostToDevice));
+  ASSERT_EQ(h_flag, (nkernels * (nkernels - 1)) / 2);
+  HIP_SAFE_CALL(hipFree(flag));
+}
 
 }  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTTHREADS_PREVENT_LINK_ERROR() {}
-#endif
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
index 9a58c99d2db39af91a8d61ec28be65f19fa9c9f2..3a76ca148cf683a83b84d351e4ebd8b2f7cdec94 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
@@ -43,7 +43,9 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hip/TestHIP_Category.hpp>
+#include <TestHIP_Category.hpp>
+
+#include <array>
 
 namespace Test {
 
@@ -70,8 +72,8 @@ TEST(hip, raw_hip_interop) {
   offset<<<dim3(100), dim3(100), 0, nullptr>>>(p);
   HIP_SAFE_CALL(hipDeviceSynchronize());
 
-  int* h_p = new int[100];
-  HIP_SAFE_CALL(hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault));
+  std::array<int, 100> h_p;
+  HIP_SAFE_CALL(hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault));
   HIP_SAFE_CALL(hipDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
@@ -81,5 +83,6 @@ TEST(hip, raw_hip_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
+  HIP_SAFE_CALL(hipFree(p));
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
index 8363765e36af1c1a73f8ae4c6b50ec1da712b6b8..8e0880ddbd0b15524be75ab97b90044e5315a8ff 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-#include <hip/TestHIP_Category.hpp>
+#include <TestHIP_Category.hpp>
 #include <Test_InterOp_Streams.hpp>
 
 namespace Test {
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
index 73e9dec10f3d70fdb7a88caa262165f5182ce52e..b759d6f407a791fb3b88b86f502cc956780294f3 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
@@ -44,7 +44,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hip/TestHIP_Category.hpp>
+#include <TestHIP_Category.hpp>
 
 struct DummyFunctor {
   using value_type = int;
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
index f13400d096db8682083293ad56c08c57f1cbbea0..ae1de8ea2d304e41d672ff2e136d16c86cbb8068 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hip/TestHIP_Category.hpp>
+#include <TestHIP_Category.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
index ac729dbc055a52b7a431f92ff66a4493f8690ba0..db360a99d3d60977cf06479e7662e21350dd5f99 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-#include <hip/TestHIP_Category.hpp>
+#include <TestHIP_Category.hpp>
 #include <Kokkos_Core.hpp>
 
 namespace Test {
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_Category.hpp b/packages/kokkos/core/unit_test/hpx/TestHPX_Category.hpp
deleted file mode 100644
index bbdcfba5c7af2e7cd5c1734b8427bd724f5e240b..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_Category.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_HPX_HPP
-#define KOKKOS_TEST_HPX_HPP
-
-#include <gtest/gtest.h>
-
-#define TEST_CATEGORY hpx
-#define TEST_CATEGORY_DEATH hpx_DeathTest
-#define TEST_EXECSPACE Kokkos::Experimental::HPX
-
-#endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
index a235e86ba49ec345587d4a6d849b100aedca98af..8e89d6d6a5da981b33eea9349ae3ace63ec3f684 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hpx/TestHPX_Category.hpp>
+#include <TestHPX_Category.hpp>
 
 #include <hpx/config.hpp>
 #include <hpx/include/lcos.hpp>
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
index 4f5569fc6b91192bf995ee0a7225ac0ab809e45f..0cedc068e594e70d750c9b515c4e08cbe527a1f4 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hpx/TestHPX_Category.hpp>
+#include <TestHPX_Category.hpp>
 
 #include <hpx/include/lcos.hpp>
 
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
index 26f419db86cf14560b7c3444c12e5abf8b5b26a0..de4cb01a7835d8b4e3d29920ed572edeeb9ef3fb 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hpx/TestHPX_Category.hpp>
+#include <TestHPX_Category.hpp>
 
 #ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
index 89b03dc3677365dc321334667abf1ebd22df1678..a98c8b0d62339fa5c2e68124984d5b790b14f692 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hpx/TestHPX_Category.hpp>
+#include <TestHPX_Category.hpp>
 
 #ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp
index 872d1a0383802c90f99557c8ef51c57dd19be918..31c35ac9a7f0a3425948157cb7f2d3a4239691ad 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <hpx/TestHPX_Category.hpp>
+#include <TestHPX_Category.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_Task.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_Task.cpp
index 4e059beef41dec1286db5062dd031e9d5e4084ef..57d0ac803bcd86b5499dd6c29348d88138088c15 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_Task.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_Task.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <hpx/TestHPX_Category.hpp>
+#include <TestHPX_Category.hpp>
 #include <TestTaskScheduler.hpp>
diff --git a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
index 449b450a7c5508392cf0316689fd9c6a844d950d..5bf1860d8e4a6bcf739656bdc7e1f790ebf60512 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-// @Kokkos_Feature_Level_Required:13
+// @Kokkos_Feature_Level_Required:12
 // Unit test for hierarchical parallelism
 // Create concurrent work hierarchically and verify if
 // contributions of paticipating processing units corresponds to expected value
@@ -63,10 +63,12 @@ struct ThreadScratch {
 
   int sX, sY;
   data_t v;
+
+  const int scratch_level = 1;
   KOKKOS_FUNCTION
   void operator()(const team_t &team) const {
     // Allocate and use scratch pad memory
-    scratch_t v_S(team.thread_scratch(1), sY);
+    scratch_t v_S(team.thread_scratch(scratch_level), sY);
     int n = team.league_rank();
 
     for (int i = 0; i < sY; ++i) v_S(i) = 0;
@@ -90,8 +92,9 @@ struct ThreadScratch {
 
     int scratchSize = scratch_t::shmem_size(sY);
     // So this works with deprecated code enabled:
-    policy_t policy = policy_t(pN, Kokkos::AUTO)
-                          .set_scratch_size(1, Kokkos::PerThread(scratchSize));
+    policy_t policy =
+        policy_t(pN, Kokkos::AUTO)
+            .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize));
 
     int max_team_size = policy.team_size_max(*this, Kokkos::ParallelForTag());
     v                 = data_t("Matrix", pN, max_team_size);
@@ -99,7 +102,7 @@ struct ThreadScratch {
     Kokkos::parallel_for(
         "Test12a_ThreadScratch",
         policy_t(pN, max_team_size)
-            .set_scratch_size(1, Kokkos::PerThread(scratchSize)),
+            .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize)),
         *this);
 
     Kokkos::fence();
@@ -117,9 +120,18 @@ struct ThreadScratch {
 
 TEST(TEST_CATEGORY, IncrTest_12a_ThreadScratch) {
   ThreadScratch<TEST_EXECSPACE> test;
+  // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to
+  // pass in the Release and RelWithDebInfo builds. Does not need the team_size
+  // to be a multiple of 32 for the Debug builds.
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  test.run(1, 32, 9);
+  test.run(2, 64, 22);
+  test.run(14, 128, 321);
+#else
   test.run(1, 55, 9);
   test.run(2, 4, 22);
   test.run(14, 277, 321);
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
index 913dce9995d1f0998c85f23ab174a6b8e8cdae80..b34f652e76d919f14c3afed0656b8bcd86dbc27f 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-// @Kokkos_Feature_Level_Required:13
+// @Kokkos_Feature_Level_Required:12
 // Unit test for hierarchical parallelism
 // Create concurrent work hierarchically and verify if
 // contributions of paticipating processing units corresponds to expected value
@@ -64,13 +64,15 @@ struct TeamScratch {
                                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
     int scratchSize = scratch_t::shmem_size(sX, sY);
 
+    const int scratch_level = 1;
+
     Kokkos::parallel_for(
         "Team",
         policy_t(pN, Kokkos::AUTO)
-            .set_scratch_size(1, Kokkos::PerTeam(scratchSize)),
+            .set_scratch_size(scratch_level, Kokkos::PerTeam(scratchSize)),
         KOKKOS_LAMBDA(const team_t &team) {
           // Allocate and use scratch pad memory
-          scratch_t v_S(team.team_scratch(1), sX, sY);
+          scratch_t v_S(team.team_scratch(scratch_level), sX, sY);
           int n = team.league_rank();
 
           Kokkos::parallel_for(
@@ -105,9 +107,18 @@ struct TeamScratch {
 
 TEST(TEST_CATEGORY, IncrTest_12b_TeamScratch) {
   TeamScratch<TEST_EXECSPACE> test;
+  // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to
+  // pass in the Release and RelWithDebInfo builds. Does not need the team_size
+  // to be a multiple of 32 for the Debug builds.
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  test.run(1, 32, 4);
+  test.run(4, 64, 10);
+  test.run(14, 128, 20);
+#else
   test.run(1, 4, 4);
   test.run(4, 7, 10);
   test.run(14, 277, 321);
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/incremental/Test13a_ParallelRed_TeamThreadRange.hpp b/packages/kokkos/core/unit_test/incremental/Test13a_ParallelRed_TeamThreadRange.hpp
index 20f0b7884ed1b0f5063ef9623377eabb97722c1c..e32b0ed0fc92684072cf004b64240093e1b981fd 100644
--- a/packages/kokkos/core/unit_test/incremental/Test13a_ParallelRed_TeamThreadRange.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test13a_ParallelRed_TeamThreadRange.hpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-// @Kokkos_Feature_Level_Required:12
+// @Kokkos_Feature_Level_Required:13
 // Unit test for hierarchical parallelism
 // Create concurrent work hierarchically and verify if
 // sum of created processing units corresponds to expected value
diff --git a/packages/kokkos/core/unit_test/incremental/Test13b_ParallelRed_TeamVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test13b_ParallelRed_TeamVectorRange.hpp
index 6b640632399cb9d2a23c68c491532e84c7b3afd3..0d37703e2b73d5ca22e73f2bfbd2f553e1fe0225 100644
--- a/packages/kokkos/core/unit_test/incremental/Test13b_ParallelRed_TeamVectorRange.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test13b_ParallelRed_TeamVectorRange.hpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-// @Kokkos_Feature_Level_Required:12
+// @Kokkos_Feature_Level_Required:13
 // Unit test for hierarchical parallelism
 // Create concurrent work hierarchically and verify if
 // sum of created processing units corresponds to expected value
diff --git a/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp
index aa82dd1f3996e2786ab253b59f9b356e5f0e3ef3..26f9d000914393a8af86d9ba1bc4bb5658a7244e 100644
--- a/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp
@@ -42,7 +42,7 @@
 //@HEADER
 */
 
-// @Kokkos_Feature_Level_Required:12
+// @Kokkos_Feature_Level_Required:13
 // Unit test for hierarchical parallelism
 // Create concurrent work hierarchically and verify if
 // sum of created processing units corresponds to expected value
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Category.hpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Category.hpp
deleted file mode 100644
index 65efbc9b9713f8db87a36069c2416b0da4ee2554..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Category.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_OMP_HPP
-#define KOKKOS_TEST_OMP_HPP
-
-#include <gtest/gtest.h>
-
-#define TEST_CATEGORY openmp
-#define TEST_CATEGORY_DEATH openmp_DeathTest
-#define TEST_EXECSPACE Kokkos::OpenMP
-#define TEST_CATEGORY_FIXTURE(name) openmp_##name
-
-#endif
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp
index 9ed647c287db884e0b5a058e8adbd10625c71cb2..e5ba9e8738275b4163a787518678c6615f91f0f7 100644
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <openmp/TestOpenMP_Category.hpp>
+#include <TestOpenMP_Category.hpp>
 #include <TestGraph.hpp>
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
index 083c94860ba7018f4418a8c2eff7112afbaaf063..c3ee67673912bb8c8f022d03322d6e8b69adfd72 100644
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <openmp/TestOpenMP_Category.hpp>
+#include <TestOpenMP_Category.hpp>
 #include <omp.h>
 
 namespace Test {
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp
index ea1472b7575e9e135f098640269d1638f3a412f1..902150da5806d27768603ac71207ce2aaef5551f 100644
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp
@@ -43,7 +43,7 @@
 //@HEADER
 */
 
-#include <openmp/TestOpenMP_Category.hpp>
+#include <TestOpenMP_Category.hpp>
 #include <Kokkos_Core.hpp>
 
 #include <mutex>
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp
index 5e67a34710f1219eb0a13d90b686faa7c6938321..2ddc6a58419040f912ebbd0f9d4f60ae113b9368 100644
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <openmp/TestOpenMP_Category.hpp>
+#include <TestOpenMP_Category.hpp>
 #include <TestTaskScheduler.hpp>
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Category.hpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Category.hpp
deleted file mode 100644
index 58aa0cc782080e255264d57b5f9838fb60ab83cd..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Category.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_OMPTARGET_HPP
-#define KOKKOS_TEST_OMPTARGET_HPP
-
-#include <gtest/gtest.h>
-
-#define TEST_CATEGORY openmptarget
-#define TEST_CATEGORY_DEATH openmptarget_DeathTest
-#define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget
-
-#endif
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Category.hpp b/packages/kokkos/core/unit_test/serial/TestSerial_Category.hpp
deleted file mode 100644
index d7ae8a9f48afbae46a90cfaad26fff6621d45570..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/serial/TestSerial_Category.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_SERIAL_HPP
-#define KOKKOS_TEST_SERIAL_HPP
-
-#include <gtest/gtest.h>
-
-#define TEST_CATEGORY serial
-#define TEST_CATEGORY_DEATH serial_DeathTest
-#define TEST_EXECSPACE Kokkos::Serial
-#define TEST_CATEGORY_FIXTURE(name) serial_##name
-
-#endif
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp
index 5933b1b86f9b782f4c42d850af5c60b95186133e..b2dba1c265cab5cfa4b982bf43f920ec666fcaa5 100644
--- a/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <serial/TestSerial_Category.hpp>
+#include <TestSerial_Category.hpp>
 #include <TestGraph.hpp>
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp
index 02f686e069dcd09e52c3a56cdd3d8cb8b0cde3d1..c08efbf447b6fe055f7f01e619b2a0b02de0cdf8 100644
--- a/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <serial/TestSerial_Category.hpp>
+#include <TestSerial_Category.hpp>
 #include <TestTaskScheduler.hpp>
diff --git a/packages/kokkos/core/unit_test/standalone/UnitTestMainInit.cpp b/packages/kokkos/core/unit_test/standalone/UnitTestMainInit.cpp
index 74e28a17de71d49f0287a4460e1238a23798d2e7..c1f7398c166bcf738111b1674a83a919293faf6d 100644
--- a/packages/kokkos/core/unit_test/standalone/UnitTestMainInit.cpp
+++ b/packages/kokkos/core/unit_test/standalone/UnitTestMainInit.cpp
@@ -48,23 +48,29 @@
 #include <Kokkos_Core.hpp>
 
 #ifdef KOKKOS_ENABLE_CUDA
-#include <cuda/TestCuda_Category.hpp>
+#include <TestCuda_Category.hpp>
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+#include <TestHIP_Category.hpp>
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+#include <TestSYCL_Category.hpp>
 #endif
 #ifdef KOKKOS_ENABLE_OPENMP
-#include <openmp/TestOpenMP_Category.hpp>
+#include <TestOpenMP_Category.hpp>
 #endif
 #ifdef KOKKOS_ENABLE_THREADS
-#include <threads/TestThreads_Category.hpp>
+#include <TestThreads_Category.hpp>
 #endif
 #ifdef KOKKOS_ENABLE_HPX
-#include <hpx/TestHPX_Category.hpp>
+#include <TestHPX_Category.hpp>
 #endif
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestOpenMPTarget_Category.hpp>
 #endif
 #ifndef TEST_EXECSPACE
 #ifdef KOKKOS_ENABLE_SERIAL
-#include <serial/TestSerial_Category.hpp>
+#include <TestSerial_Category.hpp>
 #endif
 #endif
 #include <TestReducers_d.hpp>
diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Random.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
similarity index 66%
rename from packages/kokkos/algorithms/unit_tests/TestOpenMP_Random.cpp
rename to packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
index 1ca8e0a828f06176c0959c744fe20045856534b2..018855963d35f8fef81a93985811dcc3d9b239fc 100644
--- a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Random.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
@@ -42,36 +42,47 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_OPENMP
-
-#include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
+#include <TestSYCL_Category.hpp>
 
-//----------------------------------------------------------------------------
-#include <TestRandom.hpp>
-#include <iomanip>
+#include <array>
 
 namespace Test {
 
-#define OPENMP_RANDOM_XORSHIFT64(num_draws)                             \
-  TEST(openmp, Random_XorShift64) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::OpenMP> >( \
-        num_draws);                                                     \
-  }
+// Test whether allocations survive Kokkos initialize/finalize if done via Raw
+// SYCL.
+TEST(sycl, raw_sycl_interop) {
+  sycl::default_selector device_selector;
+  sycl::queue queue(device_selector);
+  constexpr int n = 100;
+  int* p          = sycl::malloc_device<int>(n, queue);
 
-#define OPENMP_RANDOM_XORSHIFT1024(num_draws)                             \
-  TEST(openmp, Random_XorShift1024) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::OpenMP> >( \
-        num_draws);                                                       \
+  Kokkos::InitArguments arguments{-1, -1, -1, false};
+  Kokkos::initialize(arguments);
+  {
+    TEST_EXECSPACE space(queue);
+    Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n);
+    Kokkos::deep_copy(space, v, 5);
   }
+  Kokkos::finalize();
+
+  queue.submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<1>(n), [=](int idx) { p[idx] += idx; });
+  });
+  queue.wait_and_throw();
 
-OPENMP_RANDOM_XORSHIFT64(10240000)
-OPENMP_RANDOM_XORSHIFT1024(10130144)
+  std::array<int, n> h_p;
+  queue.memcpy(h_p.data(), p, sizeof(int) * n);
+  queue.wait_and_throw();
+  sycl::free(p, queue);
+
+  int64_t sum        = 0;
+  int64_t sum_expect = 0;
+  for (int i = 0; i < n; i++) {
+    sum += h_p[i];
+    sum_expect += 5 + i;
+  }
 
-#undef OPENMP_RANDOM_XORSHIFT64
-#undef OPENMP_RANDOM_XORSHIFT1024
+  ASSERT_EQ(sum, sum_expect);
+}
 }  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
-#endif
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c12c5c07295d73ddb0600d366f9c50faa6ba96df
--- /dev/null
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
@@ -0,0 +1,120 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestSYCL_Category.hpp>
+
+#include <array>
+
+namespace Test {
+
+// Test whether external allocations can be accessed by the default queue.
+TEST(sycl, raw_sycl_interop_context_1) {
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
+  sycl::default_selector device_selector;
+  sycl::queue queue(default_context, device_selector);
+  constexpr int n = 100;
+  int* p          = sycl::malloc_device<int>(n, queue);
+
+  Kokkos::Experimental::SYCL space(queue);
+  Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n);
+  Kokkos::deep_copy(v, 5);
+
+  queue.submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<1>(n), [=](int idx) { p[idx] += idx; });
+  });
+  queue.wait_and_throw();
+
+  std::array<int, n> h_p;
+  queue.memcpy(h_p.data(), p, sizeof(int) * n);
+  queue.wait_and_throw();
+  sycl::free(p, queue);
+
+  int64_t sum        = 0;
+  int64_t sum_expect = 0;
+  for (int i = 0; i < n; i++) {
+    sum += h_p[i];
+    sum_expect += 5 + i;
+  }
+
+  ASSERT_EQ(sum, sum_expect);
+}
+
+// Test whether regular View allocations can be accessed by non-default queues.
+TEST(sycl, raw_sycl_interop_context_2) {
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
+  sycl::default_selector device_selector;
+  sycl::queue queue(default_context, device_selector);
+  constexpr int n = 100;
+
+  Kokkos::Experimental::SYCL space(queue);
+  Kokkos::View<int*, Kokkos::Experimental::SYCLDeviceUSMSpace> v("default_view",
+                                                                 n);
+  Kokkos::deep_copy(space, v, 5);
+
+  auto* v_ptr = v.data();
+  queue.submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<1>(n), [=](int idx) { v_ptr[idx] += idx; });
+  });
+  queue.wait_and_throw();
+
+  std::array<int, n> h_p;
+  queue.memcpy(h_p.data(), v_ptr, sizeof(int) * n);
+  queue.wait_and_throw();
+
+  int64_t sum        = 0;
+  int64_t sum_expect = 0;
+  for (int i = 0; i < n; i++) {
+    sum += h_p[i];
+    sum_expect += 5 + i;
+  }
+
+  ASSERT_EQ(sum, sum_expect);
+}
+
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f81b7073392cc192318187e2ac31aa632f428489
--- /dev/null
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <TestSYCL_Category.hpp>
+#include <Test_InterOp_Streams.hpp>
+
+namespace Test {
+// Test Interoperability with SYCL Streams
+TEST(sycl, raw_sycl_queues) {
+  sycl::default_selector device_selector;
+  sycl::queue queue(device_selector);
+  Kokkos::InitArguments arguments{-1, -1, -1, false};
+  Kokkos::initialize(arguments);
+  int* p            = sycl::malloc_device<int>(100, queue);
+  using MemorySpace = typename TEST_EXECSPACE::memory_space;
+
+  {
+    TEST_EXECSPACE space0(queue);
+    Kokkos::View<int*, TEST_EXECSPACE> v(p, 100);
+    Kokkos::deep_copy(space0, v, 5);
+    int sum = 0;
+
+    Kokkos::parallel_for("Test::sycl::raw_sycl_queue::Range",
+                         Kokkos::RangePolicy<TEST_EXECSPACE>(space0, 0, 100),
+                         FunctorRange<MemorySpace>(v));
+    Kokkos::parallel_reduce("Test::sycl::raw_sycl_queue::RangeReduce",
+                            Kokkos::RangePolicy<TEST_EXECSPACE>(space0, 0, 100),
+                            FunctorRangeReduce<MemorySpace>(v), sum);
+    space0.fence();
+    ASSERT_EQ(6 * 100, sum);
+
+    Kokkos::parallel_for("Test::sycl::raw_sycl_queue::MDRange",
+                         Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
+                             space0, {0, 0}, {10, 10}),
+                         FunctorMDRange<MemorySpace>(v));
+    space0.fence();
+    Kokkos::parallel_reduce(
+        "Test::sycl::raw_sycl_queue::MDRangeReduce",
+        Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space0, {0, 0},
+                                                               {10, 10}),
+        FunctorMDRangeReduce<MemorySpace>(v), sum);
+    space0.fence();
+    ASSERT_EQ(7 * 100, sum);
+
+    Kokkos::parallel_for("Test::sycl::raw_sycl_queue::Team",
+                         Kokkos::TeamPolicy<TEST_EXECSPACE>(space0, 10, 10),
+                         FunctorTeam<MemorySpace, TEST_EXECSPACE>(v));
+    space0.fence();
+    Kokkos::parallel_reduce("Test::sycl::raw_sycl_queue::Team",
+                            Kokkos::TeamPolicy<TEST_EXECSPACE>(space0, 10, 10),
+                            FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v),
+                            sum);
+    space0.fence();
+    ASSERT_EQ(8 * 100, sum);
+  }
+  Kokkos::finalize();
+
+  // Try to use the queue after Kokkos' copy got out-of-scope.
+  // This kernel corresponds to "offset_streams" in the HIP and CUDA tests.
+  queue.submit([&](sycl::handler& cgh) {
+    cgh.parallel_for(sycl::range<1>(100), [=](int idx) { p[idx] += idx; });
+  });
+  queue.wait_and_throw();
+
+  int h_p[100];
+  queue.memcpy(h_p, p, sizeof(int) * 100);
+  queue.wait_and_throw();
+  int64_t sum        = 0;
+  int64_t sum_expect = 0;
+  for (int i = 0; i < 100; i++) {
+    sum += h_p[i];
+    sum_expect += 8 + i;
+  }
+
+  ASSERT_EQ(sum, sum_expect);
+}
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Category.hpp b/packages/kokkos/core/unit_test/threads/TestThreads_Category.hpp
deleted file mode 100644
index 800772b42dd0d3523d3e9243917ae005665e0bed..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/threads/TestThreads_Category.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TEST_THREADS_HPP
-#define KOKKOS_TEST_THREADS_HPP
-
-#include <gtest/gtest.h>
-
-#define TEST_CATEGORY threads
-#define TEST_CATEGORY_DEATH threads_DeathTest
-#define TEST_EXECSPACE Kokkos::Threads
-
-#endif
diff --git a/packages/kokkos/core/unit_test/tools/TestAllCalls.cpp b/packages/kokkos/core/unit_test/tools/TestAllCalls.cpp
index 7e37816c5dc26286cb267dba327de205a75e3ecf..7ee8d68e30dd1de252866ff83c4aed8e07bd2ab5 100644
--- a/packages/kokkos/core/unit_test/tools/TestAllCalls.cpp
+++ b/packages/kokkos/core/unit_test/tools/TestAllCalls.cpp
@@ -46,11 +46,12 @@
 // testing library this tests that our shared-library loading based profiling
 // mechanisms work
 
-#include <iostream>
 #include <Kokkos_Core.hpp>
+#include <iostream>
+#include <sstream>
 
-int main() {
-  Kokkos::initialize();
+int main(int argc, char** argv) {
+  Kokkos::initialize(argc, argv);
   {
     // This test only uses host kernel launch mechanisms. This is to allow for
     // the test to run on platforms where CUDA lambda launch isn't supported.
@@ -84,6 +85,7 @@ int main() {
     Kokkos::Profiling::stopSection(sectionId);
     Kokkos::Profiling::destroyProfileSection(sectionId);
     Kokkos::Profiling::markEvent("profiling_event");
+    Kokkos::Tools::declareMetadata("dogs", "good");
   }
   Kokkos::finalize();
 }
diff --git a/packages/kokkos/core/unit_test/tools/printing-tool.cpp b/packages/kokkos/core/unit_test/tools/printing-tool.cpp
index c2abada0a921b4a7b403fdd49ef3a6837cc47b58..76b7837d0365306201c83eb8e2ae92523d3a6670 100644
--- a/packages/kokkos/core/unit_test/tools/printing-tool.cpp
+++ b/packages/kokkos/core/unit_test/tools/printing-tool.cpp
@@ -4,6 +4,15 @@
 
 struct Kokkos_Profiling_KokkosPDeviceInfo;
 
+// just get the basename for print_help/parse_args
+std::string get_basename(char* cmd, int idx = 0) {
+  if (idx > 0) return cmd;
+  std::string _cmd = cmd;
+  auto _pos        = _cmd.find_last_of('/');
+  if (_pos != std::string::npos) return _cmd.substr(_pos + 1);
+  return _cmd;
+}
+
 struct SpaceHandle {
   char name[64];
 };
@@ -23,6 +32,16 @@ extern "C" void kokkosp_finalize_library() {
   std::cout << "kokkosp_finalize_library::";
 }
 
+extern "C" void kokkosp_print_help(char* exe) {
+  std::cout << "kokkosp_print_help:" << get_basename(exe) << "::";
+}
+
+extern "C" void kokkosp_parse_args(int argc, char** argv) {
+  std::cout << "kokkosp_parse_args:" << argc;
+  for (int i = 0; i < argc; ++i) std::cout << ":" << get_basename(argv[i], i);
+  std::cout << "::";
+}
+
 extern "C" void kokkosp_begin_parallel_for(const char* name,
                                            const uint32_t devID,
                                            uint64_t* kID) {
@@ -116,3 +135,6 @@ extern "C" void kokkosp_destroy_profile_section(uint32_t sec_id) {
 extern "C" void kokkosp_profile_event(const char* name) {
   std::cout << "kokkosp_profile_event:" << name << "::";
 }
+extern "C" void kokkosp_declare_metadata(const char* key, const char* value) {
+  std::cout << "kokkosp_declare_metadata:" << key << ":" << value << "::";
+}
diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt b/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..df16774e742e9f60a116a5a8dcdf93bcc17b0606
--- /dev/null
+++ b/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Kokkos minimally requires 3.16 right now,
+# but your project can set it higher
+cmake_minimum_required(VERSION 3.16)
+
+# Projects can safely mix languages - must have C++ support
+# Kokkos flags will only apply to C++ files
+project(Example CXX Fortran)
+
+# You need this for using Kokkos_ROOT variable
+message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
+cmake_policy(SET CMP0074 NEW)
+
+# Look for an installed Kokkos but force using the compiler launcher
+# to ensure that targets depending on Kokkos use the same compiler
+# as when kokkos was installed, e.g. if kokkos was built with
+# g++ and the CMAKE_CXX_COMPILER=clang++ then example_with_kokkos
+# will be compiled and linked with g++ whereas example_no_kokkos
+# will be compiled and linked with clang++
+find_package(Kokkos REQUIRED COMPONENTS launch_compiler)
+
+add_executable(example_no_kokkos bar.cpp)
+add_executable(example_with_kokkos foo.cpp)
+
+# This is the only thing required to set up compiler/linker flags
+target_link_libraries(example_with_kokkos Kokkos::kokkos)
+
+enable_testing()
+add_test(NAME KokkosLauncher_NoKokkos_Verify COMMAND example_no_kokkos 10)
+add_test(NAME KokkosLauncher_WithKokkos_Verify COMMAND example_with_kokkos 10)
diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/bar.cpp b/packages/kokkos/example/build_cmake_installed_different_compiler/bar.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e02c2b8c688650fe3c5e0beefb5ea1ce01de2fa8
--- /dev/null
+++ b/packages/kokkos/example/build_cmake_installed_different_compiler/bar.cpp
@@ -0,0 +1,7 @@
+
+#include <cstdio>
+
+int main() {
+  puts("hello world!");
+  return 0;
+}
diff --git a/packages/kokkos/algorithms/unit_tests/TestHIP.cpp b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
similarity index 63%
rename from packages/kokkos/algorithms/unit_tests/TestHIP.cpp
rename to packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
index 5e5ccb6a2eb8e988986eaa4ce06e34cf028bf6ed..fc10366f71bd9b0d421b18e935c2cea86925904b 100644
--- a/packages/kokkos/algorithms/unit_tests/TestHIP.cpp
+++ b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
@@ -42,42 +42,52 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#ifdef KOKKOS_ENABLE_HIP
+#include <Kokkos_Core.hpp>
+#include <cstdio>
 
-#include <cstdint>
-#include <iostream>
-#include <iomanip>
+struct CountFunctor {
+  KOKKOS_FUNCTION void operator()(const long i, long& lcount) const {
+    lcount += (i % 2) == 0;
+  }
+};
 
-#include <gtest/gtest.h>
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  Kokkos::DefaultExecutionSpace::print_configuration(std::cout);
 
-#include <Kokkos_Core.hpp>
+  if (argc < 2) {
+    fprintf(stderr, "Usage: %s [<kokkos_options>] <size>\n", argv[0]);
+    Kokkos::finalize();
+    exit(1);
+  }
 
-#include <TestRandom.hpp>
-#include <TestSort.hpp>
+  const long n = strtol(argv[1], nullptr, 10);
 
-namespace Test {
+  printf("Number of even integers from 0 to %ld\n", n - 1);
 
-void hip_test_random_xorshift64(size_t num_draws) {
-  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::HIP>>(
-      num_draws);
-  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Device<
-      Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace>>>(num_draws);
-}
+  Kokkos::Timer timer;
+  timer.reset();
 
-void hip_test_random_xorshift1024(size_t num_draws) {
-  Impl::test_random<
-      Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::HIP>>(num_draws);
-  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Device<
-      Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace>>>(num_draws);
-}
+  // Compute the number of even integers from 0 to n-1, in parallel.
+  long count = 0;
+  CountFunctor functor;
+  Kokkos::parallel_reduce(n, functor, count);
+
+  double count_time = timer.seconds();
+  printf("  Parallel: %ld    %10.6f\n", count, count_time);
+
+  timer.reset();
+
+  // Compare to a sequential loop.
+  long seq_count = 0;
+  for (long i = 0; i < n; ++i) {
+    seq_count += (i % 2) == 0;
+  }
+
+  count_time = timer.seconds();
+  printf("Sequential: %ld    %10.6f\n", seq_count, count_time);
+
+  Kokkos::finalize();
 
-TEST(hip, Random_XorShift64) { hip_test_random_xorshift64(132141141); }
-TEST(hip, Random_XorShift1024_0) { hip_test_random_xorshift1024(52428813); }
-TEST(hip, SortUnsigned) {
-  Impl::test_sort<Kokkos::Experimental::HIP, unsigned>(171);
+  return (count == seq_count) ? 0 : -1;
 }
-}  // namespace Test
-#else
-void KOKKOS_ALGORITHMS_UNITTESTS_TESTHIP_PREVENT_LINK_ERROR() {}
-#endif /* #ifdef KOKKOS_ENABLE_HIP */
diff --git a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp
index bdb630a1ad9a1b77c91989272c65ab84218afcdf..5810e0ee7a267c79a823d00516937b8ccd43c0d9 100644
--- a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp
+++ b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp
@@ -85,7 +85,14 @@ struct hello_world {
   // (as well as on the host).  If not building with CUDA, the macro
   // is unnecessary but harmless.
   KOKKOS_INLINE_FUNCTION
-  void operator()(const int i) const { printf("Hello from i = %i\n", i); }
+  void operator()(const int i) const {
+    // FIXME_SYCL needs workaround for printf
+#ifndef __SYCL_DEVICE_ONLY__
+    printf("Hello from i = %i\n", i);
+#else
+    (void)i;
+#endif
+  }
 };
 
 int main(int argc, char* argv[]) {
diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
index c3bd00e7d40adbfd808643e939d58647a346b3b0..06f209774eae10a4a11161d17aae979450d6e850 100644
--- a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
+++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
@@ -104,8 +104,13 @@ int main(int argc, char* argv[]) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   Kokkos::parallel_for(
       15, KOKKOS_LAMBDA(const int i) {
+  // FIXME_SYCL needs workaround for printf
+#ifndef __SYCL_DEVICE_ONLY__
         // printf works in a CUDA parallel kernel; std::ostream does not.
         printf("Hello from i = %i\n", i);
+#else
+	(void)i;
+#endif
       });
 #endif
   // You must call finalize() after you are done using Kokkos.
diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp b/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp
index caacc828e5075fa6a179f8d9b0a99a31a29fc8aa..32b18e4d2047c5f3dcc23614109f4440b4686549 100644
--- a/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp
+++ b/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp
@@ -122,7 +122,7 @@ int main() {
     // Fill the 'data' array on the host with random numbers.  We assume
     // that they come from some process which is only implemented on the
     // host, via some library.  (That's true in this case.)
-    for (size_type i = 0; i < data.extent(0); ++i) {
+    for (size_type i = 0; i < static_cast<size_type>(data.extent(0)); ++i) {
       h_data(i) = rand() % nnumbers;
     }
     Kokkos::deep_copy(data, h_data);  // copy from host to device
diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt
index ca9f0bf8da93f918cb07e825fb36f35c97619c11..2a6c3f6c27a3699d0715c8e5ab41448221432aaf 100644
--- a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt
+++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt
@@ -7,4 +7,3 @@ KOKKOS_ADD_EXECUTABLE(
   tutorial_06_simple_mdrangepolicy
   SOURCES simple_mdrangepolicy.cpp
 )
-
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
index 643ac87a86168e29d11332251a53efbc817ea9f5..597d1e3056ece9ef5865a3fb79dfef09ccf50a6a 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
@@ -68,13 +68,15 @@ struct init_view {
   ViewType a;
   init_view(ViewType a_) : a(a_) {}
 
+  using size_type = typename ViewType::size_type;
+
   KOKKOS_INLINE_FUNCTION
   void operator()(const typename ViewType::size_type i) const {
     // On CPUs this loop could be vectorized so j should do stride 1
     // access on a for optimal performance. I.e. a should be LayoutRight.
     // On GPUs threads should do coalesced loads and stores. That means
     // that i should be the stride one access for optimal performance.
-    for (typename ViewType::size_type j = 0; j < a.extent(1); ++j) {
+    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
       a(i, j) = 1.0 * a.extent(0) * i + 1.0 * j;
     }
   }
@@ -95,6 +97,8 @@ struct contraction {
   contraction(view_type a_, ViewType1 v1_, ViewType2 v2_)
       : a(a_), v1(v1_), v2(v2_) {}
 
+  using size_type = typename view_type::size_type;
+
   // As with the initialization functor the performance of this operator
   // depends on the architecture and the chosen data layouts.
   // On CPUs optimal would be to vectorize the inner loop, so j should be the
@@ -104,7 +108,7 @@ struct contraction {
   // LayoutLeft and v2 LayoutRight.
   KOKKOS_INLINE_FUNCTION
   void operator()(const view_type::size_type i) const {
-    for (view_type::size_type j = 0; j < v1.extent(1); ++j) {
+    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
       a(i) = v1(i, j) * v2(j, i);
     }
   }
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
index cff215d0ebf9581cc3de646287bad432046e75f7..00bfeea36b972e6ea08ab8c82ec5aaca1a4e2af5 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
@@ -113,8 +113,9 @@ int main(int narg, char* arg[]) {
 
     srand(134231);
 
+    using size_type = view_type::size_type;
     for (int i = 0; i < size; i++) {
-      for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
+      for (size_type j = 0; j < static_cast<size_type>(h_idx.extent(1)); ++j) {
         h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size;
       }
     }
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
index ca2eeac41682a5629d3e66903474b78fa96d851c..20e5c5a284f415e7627fd07df20ffbe5856f3428 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
@@ -78,9 +78,11 @@ struct set_boundary {
 
   set_boundary(ViewType a_, double value_) : a(a_), value(value_) {}
 
+  using size_type = typename ViewType::size_type;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(const typename ViewType::size_type i) const {
-    for (typename ViewType::size_type j = 0; j < a.extent(1); ++j) {
+  void operator()(const size_type i) const {
+    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
       a(i, j) = value;
     }
   }
@@ -96,11 +98,12 @@ struct set_inner {
 
   set_inner(ViewType a_, double value_) : a(a_), value(value_) {}
 
+  using size_type = typename ViewType::size_type;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(const typename ViewType::size_type i) const {
-    using size_type = typename ViewType::size_type;
-    for (size_type j = 0; j < a.extent(1); ++j) {
-      for (size_type k = 0; k < a.extent(2); ++k) {
+  void operator()(const size_type i) const {
+    for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) {
+      for (size_type k = 0; k < static_cast<size_type>(a.extent(2)); ++k) {
         a(i, j, k) = value;
       }
     }
@@ -116,12 +119,13 @@ struct update {
 
   update(ViewType a_, const double dt_) : a(a_), dt(dt_) {}
 
+  using size_type = typename ViewType::size_type;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(typename ViewType::size_type i) const {
-    using size_type = typename ViewType::size_type;
+  void operator()(size_type i) const {
     i++;
-    for (size_type j = 1; j < a.extent(1) - 1; j++) {
-      for (size_type k = 1; k < a.extent(2) - 1; k++) {
+    for (size_type j = 1; j < static_cast<size_type>(a.extent(1) - 1); j++) {
+      for (size_type k = 1; k < static_cast<size_type>(a.extent(2) - 1); k++) {
         a(i, j, k) += dt * (a(i, j, k + 1) - a(i, j, k - 1) + a(i, j + 1, k) -
                             a(i, j - 1, k) + a(i + 1, j, k) - a(i - 1, j, k));
       }
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
index 174d13d102a337bda707accaa915547aa97d488d..3c0fcd085c7c2afe29a328dfa3f574ab9ac81276 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
@@ -175,8 +175,9 @@ int main(int narg, char* arg[]) {
     // Get a reference to the host view of idx directly (equivalent to
     // idx.view<idx_type::host_mirror_space>() )
     idx_type::t_host h_idx = idx.h_view;
+    using size_type        = view_type::size_type;
     for (int i = 0; i < size; ++i) {
-      for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
+      for (size_type j = 0; j < static_cast<size_type>(h_idx.extent(1)); ++j) {
         h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size;
       }
     }
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp
index 9afc144752284288704ff9223c52a9261ba7a0df..735de65e056c84a5290105db39d5369a50f16ec7 100644
--- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp
@@ -75,8 +75,13 @@ struct hello_world {
     // The TeamPolicy<>::member_type provides functions to query the multi
     // dimensional index of a thread as well as the number of thread-teams and
     // the size of each team.
+#ifndef __SYCL_DEVICE_ONLY__
+    // FIXME_SYCL needs printf workaround
     printf("Hello World: %i %i // %i %i\n", thread.league_rank(),
            thread.team_rank(), thread.league_size(), thread.team_size());
+#else
+    (void)thread;
+#endif
   }
 };
 
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
index a182b08b8439d61d1a25fee5e8798ea56f761c0c..dcb1e0561bca8b096b528d61128f85c6254c221c 100644
--- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
@@ -85,11 +85,16 @@ int main(int narg, char* args[]) {
       policy,
       KOKKOS_LAMBDA(const team_member& thread, int& lsum) {
         lsum += 1;
-        // TeamPolicy<>::member_type provides functions to query the
-        // multidimensional index of a thread, as well as the number of
-        // thread teams and the size of each team.
+    // TeamPolicy<>::member_type provides functions to query the
+    // multidimensional index of a thread, as well as the number of
+    // thread teams and the size of each team.
+#ifndef __SYCL_DEVICE_ONLY__
+        // FIXME_SYCL needs workaround for printf
         printf("Hello World: %i %i // %i %i\n", thread.league_rank(),
                thread.team_rank(), thread.league_size(), thread.team_size());
+#else
+        (void)thread;
+#endif
       },
       sum);
 #endif
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp
index 29e23e904c545e2f4258cf0e462d4315ff9edfdd..a528b71fe33f817b03dc32bacdbe8cd96271eab7 100644
--- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp
@@ -73,8 +73,13 @@ struct hello_world {
     // also executed by all threads of the team.
     Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31),
                          [&](const int& i) {
+#ifndef __SYCL_DEVICE_ONLY__
+                           // FIXME_SYCL needs printf workaround
                            printf("Hello World: (%i , %i) executed loop %i \n",
                                   thread.league_rank(), thread.team_rank(), i);
+#else
+			   (void) i;
+#endif
                          });
   }
 };
diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash
index 144ed92608f2574ed067abb92d6bdbaf3a89b751..e9871b436971a551c82751756b2b18de9175839a 100755
--- a/packages/kokkos/generate_makefile.bash
+++ b/packages/kokkos/generate_makefile.bash
@@ -146,6 +146,7 @@ display_help_text() {
       echo "--with-cuda[=/Path/To/Cuda]:          Enable Cuda and set path to Cuda Toolkit."
       echo "--with-hip[=/Path/To/Hip]:            Enable Hip and set path to ROCM Toolkit."
       echo "--with-openmptarget:                  Enable OpenMPTarget backend."
+      echo "--with-sycl:                          Enable Sycl backend."
       echo "--with-openmp:                        Enable OpenMP backend."
       echo "--with-pthread:                       Enable Pthreads backend."
       echo "--with-serial:                        Enable Serial backend."
@@ -159,7 +160,7 @@ display_help_text() {
       echo "               [AMD: GPU]"
       echo "                 VEGA900         = AMD GPU MI25 GFX900"
       echo "                 VEGA906         = AMD GPU MI50/MI60 GFX906"
-      echo "                 VEGA908         = AMD GPU"
+      echo "                 VEGA908         = AMD GPU MI100 GFX908"
       echo "               [ARM]"
       echo "                 ARMV80          = ARMv8.0 Compatible CPU"
       echo "                 ARMV81          = ARMv8.1 Compatible CPU"
@@ -199,7 +200,7 @@ display_help_text() {
       echo "--cxxflags=[FLAGS]            Overwrite CXXFLAGS for library build and test"
       echo "                                build.  This will still set certain required"
       echo "                                flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
-      echo "                                --std=c++14, etc.)."
+      echo "                                -std=c++14, etc.)."
       echo "--cxxstandard=[FLAGS]         Set CMAKE_CXX_STANDARD for library build and test"
       echo "                                c++14 (default), c++17, c++1y, c++1z, c++2a"
       echo "--ldflags=[FLAGS]             Overwrite LDFLAGS for library build and test"
@@ -285,6 +286,9 @@ do
     --with-openmp)
       update_kokkos_devices OpenMP
       ;;
+    --with-sycl)
+      update_kokkos_devices Sycl
+      ;;
     --with-pthread)
       update_kokkos_devices Pthread
       ;;
@@ -356,7 +360,7 @@ do
       ;;
     --compiler*)
       COMPILER="${key#*=}"
-      CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l)
+      CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep -c "no ${COMPILER}")
       if [ ${CNUM} -gt 0 ]; then
         echo "Invalid compiler by --compiler command: '${COMPILER}'"
         exit
@@ -365,7 +369,7 @@ do
         echo "Empty compiler specified by --compiler command."
         exit
       fi
-      CNUM=$(command -v ${COMPILER} | grep ${COMPILER} | wc -l)
+      CNUM=$(command -v ${COMPILER} | grep -c ${COMPILER})
       if [ ${CNUM} -eq 0 ]; then
         echo "Invalid compiler by --compiler command: '${COMPILER}'"
         exit
diff --git a/packages/kokkos/gnu_generate_makefile.bash b/packages/kokkos/gnu_generate_makefile.bash
index 20ad18bd29a0a1c5529a572daf7490a08e63b320..ea509669f068d677a0354c83891d7caf298b1e34 100755
--- a/packages/kokkos/gnu_generate_makefile.bash
+++ b/packages/kokkos/gnu_generate_makefile.bash
@@ -94,7 +94,7 @@ do
       ;;
     --compiler*)
       COMPILER="${key#*=}"
-      CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l)
+      CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep -c "no ${COMPILER}")
       if [ ${CNUM} -gt 0 ]; then
         echo "Invalid compiler by --compiler command: '${COMPILER}'"
         exit
@@ -103,7 +103,7 @@ do
         echo "Empty compiler specified by --compiler command."
         exit
       fi
-      CNUM=$(command -v ${COMPILER} | grep ${COMPILER} | wc -l)
+      CNUM=$(command -v ${COMPILER} | grep -c ${COMPILER})
       if [ ${CNUM} -eq 0 ]; then
         echo "Invalid compiler by --compiler command: '${COMPILER}'"
         exit
@@ -174,7 +174,7 @@ do
       echo "--cxxflags=[FLAGS]            Overwrite CXXFLAGS for library build and test"
       echo "                                build.  This will still set certain required"
       echo "                                flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
-      echo "                                --std=c++14, etc.)."
+      echo "                                -std=c++14, etc.)."
       echo "--cxxstandard=[FLAGS]         Overwrite KOKKOS_CXX_STANDARD for library build and test"
       echo "                                c++14 (default), c++17, c++1y, c++1z, c++2a"
       echo "--ldflags=[FLAGS]             Overwrite LDFLAGS for library build and test"
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index e746bd7d0103b0cb2e813290304f8f90c9b35f72..7a58f593d00e424b7d7dcbda226f5c4c6d7ccd3c 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -23,3 +23,4 @@ tag:  3.1.01     date: 05:04:2020    master: 785d19f2    release: 2be028bc
 tag:  3.2.00     date: 08:19:2020    master: 3b2fdc7e    release: 5dc6d303
 tag:  3.3.00     date: 12:16:2020    master: 734f577a    release: 1535ba5c
 tag:  3.3.01     date: 01:06:2021    master: 6d65b5a3    release: 4d23839c
+tag:  3.4.00     date: 04:26:2021    master: 1fb0c284    release: 5d7738d6
diff --git a/packages/kokkos/scripts/docker/Dockerfile.clang b/packages/kokkos/scripts/docker/Dockerfile.clang
index 8d1a95b8bafe5b04188b0c77192060fca2aa0e5f..6aaf75fae55ff975df5045bb73a0813236871d89 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.clang
+++ b/packages/kokkos/scripts/docker/Dockerfile.clang
@@ -9,7 +9,7 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-ARG CMAKE_VERSION=3.10.3
+ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_KEY=2D2CEF1034921684 && \
     CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.gcc b/packages/kokkos/scripts/docker/Dockerfile.gcc
index fd37305f9c20d69c0dfe319f0365db81dd03cf16..56972d3185d0f62e6b9effb64e8f2cedefe25c66 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.gcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.gcc
@@ -1,6 +1,6 @@
 FROM gcc:5.3.0
 
-ARG CMAKE_VERSION=3.10.3
+ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_KEY=2D2CEF1034921684 && \
     CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.hipcc b/packages/kokkos/scripts/docker/Dockerfile.hipcc
index dddd09ae44c00514c0f1955e9b432f4be8c813f9..d3b6b93a023396aa785703a5aeec0c4001af34e8 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.hipcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.hipcc
@@ -2,6 +2,7 @@ ARG BASE=rocm/dev-ubuntu-20.04:3.8
 FROM $BASE
 
 RUN apt-get update && apt-get install -y \
+        git \
         kmod \
         wget \
         ccache \
@@ -12,7 +13,7 @@ RUN apt-get update && apt-get install -y \
 
 ENV PATH=/opt/rocm/bin:$PATH
 
-ARG CMAKE_VERSION=3.10.3
+ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_KEY=2D2CEF1034921684 && \
     CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
index ce4ffaa0429021803e9653504205c9d8e33b9c86..5d53a645e4bc7c551698719d3edb1c3768467ca7 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
+++ b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
@@ -11,7 +11,7 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-ARG CMAKE_VERSION=3.10.3
+ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_KEY=2D2CEF1034921684 && \
     CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvcc b/packages/kokkos/scripts/docker/Dockerfile.nvcc
index 868f9be1c4d7afbf3bbb18994eadd1650517dff8..e17accc0663980694821b8002b976277fcd9ca42 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.nvcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.nvcc
@@ -5,6 +5,7 @@ ARG ADDITIONAL_PACKAGES
 
 RUN apt-get update && apt-get install -y \
         bc \
+        git \
         wget \
         ccache \
         $ADDITIONAL_PACKAGES \
@@ -12,7 +13,7 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-ARG CMAKE_VERSION=3.10.3
+ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_KEY=2D2CEF1034921684 && \
     CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl
index 331270491f9150fab8d31ec02f26a84193e5cd2f..fdcd6d01fb8e3158000aa1507bb5bfcf7e0d9b4e 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.sycl
+++ b/packages/kokkos/scripts/docker/Dockerfile.sycl
@@ -1,16 +1,18 @@
-ARG BASE=intel/oneapi-basekit:devel-ubuntu18.04
+ARG BASE=nvidia/cuda:10.2-devel
 FROM $BASE
 
 RUN apt-get update && apt-get install -y \
+        bc \
+        git \
         wget \
         ccache \
+        ninja-build \
+        python3 \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin/:$PATH
-
-ARG CMAKE_VERSION=3.10.3
+ARG CMAKE_VERSION=3.18.5
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_KEY=2D2CEF1034921684 && \
     CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
@@ -26,3 +28,20 @@ RUN CMAKE_KEY=2D2CEF1034921684 && \
     sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \
     rm cmake*
 ENV PATH=${CMAKE_DIR}/bin:$PATH
+
+ENV SYCL_DIR=/opt/sycl
+RUN SYCL_VERSION=20210311 && \
+    SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \
+    SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \
+    SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
+    wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \
+    mkdir llvm && \
+    tar -xf ${SYCL_ARCHIVE} -C llvm --strip-components=1 && \
+    cd llvm && \
+    python3 buildbot/configure.py --cuda && \
+    python3 buildbot/compile.py && \
+    mkdir -p ${SYCL_DIR} && \
+    mv ${SCRATCH_DIR}/llvm/build/install/* ${SYCL_DIR} && \
+    echo "${SYCL_DIR}/lib" > /etc/ld.so.conf.d/sycl.conf && ldconfig && \
+    rm -rf ${SCRATCH_DIR}
+ENV PATH=${SYCL_DIR}/bin:$PATH
diff --git a/packages/kokkos/scripts/spack_test/CMakeLists.txt b/packages/kokkos/scripts/spack_test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4c28bd0b8eccff2487ae1388960bbbc6b8504a34
--- /dev/null
+++ b/packages/kokkos/scripts/spack_test/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.16)
+project(SpackTestGen)
+set(TEST_LIST_DEF ${CMAKE_CURRENT_SOURCE_DIR}/test_list.def)
+file(STRINGS ${TEST_LIST_DEF} TEST_FILES)
+
+#Copy test source to Spack test directory
+foreach (TEST_FILE ${TEST_FILES})
+  set(TEST_FILE_LOCATION ${SPACK_PACKAGE_SOURCE_DIR}/${TEST_FILE})
+  file(COPY ${TEST_FILE_LOCATION} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/out)
+endforeach()
+
+#Clean up names
+foreach(TEST_FILE ${TEST_FILES} )
+  string( REGEX REPLACE ".+\/" "" TEST_FILE ${TEST_FILE} )
+  list(APPEND SRC_NAME_LIST ${TEST_FILE})
+  string( REPLACE ".cpp" "" TEST_FILE ${TEST_FILE} )
+  list(APPEND BIN_NAME_LIST ${TEST_FILE})
+endforeach()
+
+#Configure test cmake script and run script
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt.in ${CMAKE_CURRENT_SOURCE_DIR}/out/CMakeLists.txt @ONLY)
diff --git a/packages/kokkos/scripts/spack_test/CMakeLists.txt.in b/packages/kokkos/scripts/spack_test/CMakeLists.txt.in
new file mode 100644
index 0000000000000000000000000000000000000000..4a216df4aab7b326efc94866b0f943af7c42d29f
--- /dev/null
+++ b/packages/kokkos/scripts/spack_test/CMakeLists.txt.in
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.16) 
+project(kokkos_spack_test CXX)
+find_package(Kokkos REQUIRED)
+
+set(SRC_NAME_LIST "@SRC_NAME_LIST@")
+set(BIN_NAME_LIST "@BIN_NAME_LIST@")
+
+enable_testing()
+list(LENGTH SRC_NAME_LIST LEN) 
+math(EXPR LEN "${LEN}-1")
+
+set(CMAKE_CXX_COMPILER ${Kokkos_CXX_COMPILER})
+
+foreach (it RANGE ${LEN}) 
+  list(GET SRC_NAME_LIST ${it} src) 
+  list(GET BIN_NAME_LIST ${it} bin)
+  add_executable(${bin} ${src})
+  target_link_libraries(${bin} Kokkos::kokkos)
+  add_test(NAME ${bin} COMMAND ${bin})
+  set_tests_properties(${bin} PROPERTIES
+    LABELS "Kokkos"
+    PROCESSORS 1
+    TIMEOUT 60)
+endforeach()
diff --git a/packages/kokkos/scripts/spack_test/test_list.def b/packages/kokkos/scripts/spack_test/test_list.def
new file mode 100644
index 0000000000000000000000000000000000000000..8703ccb9854140245f5ff684b85eb32c6881b207
--- /dev/null
+++ b/packages/kokkos/scripts/spack_test/test_list.def
@@ -0,0 +1,4 @@
+example/tutorial/01_hello_world/hello_world.cpp
+example/tutorial/02_simple_reduce/simple_reduce.cpp
+example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
+example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
diff --git a/packages/kokkos/scripts/testing_scripts/TestEXEC_TEST.cpp b/packages/kokkos/scripts/testing_scripts/TestEXEC_TEST.cpp
index f2d33eb26cb8e131974cab8ce6693bc00f0dda10..883e88b51b7dd6c3f116ea8731934db5b7dde72a 100644
--- a/packages/kokkos/scripts/testing_scripts/TestEXEC_TEST.cpp
+++ b/packages/kokkos/scripts/testing_scripts/TestEXEC_TEST.cpp
@@ -43,5 +43,5 @@
 //@HEADER
 */
 
-#include <exec/TestEXEC_Category.hpp>
+#include <TestEXEC_Category.hpp>
 #include <TestTEST.hpp>
diff --git a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash
index cd767975115684e19db41f3af5392eed566be4ec..f21124ed6e716844e876cf209ee2af5cb9a7dbbd 100755
--- a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash
+++ b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash
@@ -86,7 +86,7 @@ do
       ;;
     --compiler*)
       COMPILER="${key#*=}"
-      CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l)
+      CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep -c "no ${COMPILER}")
       if [ ${CNUM} -gt 0 ]; then
         echo "Invalid compiler by --compiler command: '${COMPILER}'"
         exit
@@ -95,7 +95,7 @@ do
         echo "Empty compiler specified by --compiler command."
         exit
       fi
-      CNUM=$(command -v ${COMPILER} | grep ${COMPILER} | wc -l)
+      CNUM=$(command -v ${COMPILER} | grep -c ${COMPILER})
       if [ ${CNUM} -eq 0 ]; then
         echo "Invalid compiler by --compiler command: '${COMPILER}'"
         exit
@@ -166,7 +166,7 @@ do
       echo "--cxxflags=[FLAGS]            Overwrite CXXFLAGS for library build and test"
       echo "                                build.  This will still set certain required"
       echo "                                flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
-      echo "                                --std=c++14, etc.)."
+      echo "                                -std=c++14, etc.)."
       echo "--cxxstandard=[FLAGS]         Overwrite KOKKOS_CXX_STANDARD for library build and test"
       echo "                                c++14 (default), c++17, c++1y, c++1z, c++2a"
       echo "--ldflags=[FLAGS]             Overwrite LDFLAGS for library build and test"
diff --git a/packages/kokkos/scripts/testing_scripts/test_all_sandia b/packages/kokkos/scripts/testing_scripts/test_all_sandia
index 578d2992a3840fa7a87d22c29603246061c150a6..877b35b73e1aef7c64cdb2d7e5f00f7bc235781c 100755
--- a/packages/kokkos/scripts/testing_scripts/test_all_sandia
+++ b/packages/kokkos/scripts/testing_scripts/test_all_sandia
@@ -112,6 +112,10 @@ if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then
   MACHINE=kokkos-dev
 fi
 
+if [[ "$HOSTNAME" == sogpu01* ]]; then
+  MACHINE=sogpu
+fi
+
 if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
   if [[ "$MACHINE" = "" ]]; then
     MACHINE=sems
@@ -269,9 +273,9 @@ if [ "$MACHINE" = "sems" ]; then
   # On rhel7 sems machines gcc/7.3.0, clang/4.0.1, and intel/16.0.3 are missing
   # Remove kokkkos-env module use
 
-  module load sems-cmake/3.12.2
-  BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0"
+  module load sems-cmake/3.17.1
+  BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
+  CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0"
   SKIP_HWLOC=True
   # No sems hwloc module
 
@@ -304,15 +308,47 @@ if [ "$MACHINE" = "sems" ]; then
                "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   fi
+elif [ "$MACHINE" = "sogpu" ]; then
+  source /projects/sems/modulefiles/utils/sems-modules-init.sh
+
+  module load sems-cmake/3.17.1 sems-git
+  BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
+  CUDA_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0"
+  CUDA11_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/8.3.0"
+  SKIP_HWLOC=True
+  # No sems hwloc module
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=Volta70"
+  fi
+
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "cuda/10.1 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+              )
 elif [ "$MACHINE" = "kokkos-dev" ]; then
   source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
-  module load sems-cmake/3.12.2
-  BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/6.1.0"
-  CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0"
-  CUDA11_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0"
-  CLANG7_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-cuda/9.2"
+  module load sems-cmake/3.17.1
+  BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
+  CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/6.1.0"
+  CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0"
+  CUDA11_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0"
+  CLANG7_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-cuda/9.2"
   SKIP_HWLOC=True
 
   if [ -z "$ARCH_FLAG" ]; then
@@ -354,10 +390,10 @@ elif [ "$MACHINE" = "white" ]; then
   SKIP_HWLOC=True
   export SLURM_TASKS_PER_NODE=32
 
-  BASE_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/<COMPILER_VERSION>"
-  IBM_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
-  CUDA_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1"
-  CUDA10_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1"
+  BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
+  IBM_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
+  CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1"
+  CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1"
 
   # Don't do pthread with Power
   GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
@@ -372,7 +408,8 @@ elif [ "$MACHINE" = "white" ]; then
     )
   else
     # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+    COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/7.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/9.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
@@ -391,10 +428,10 @@ elif [ "$MACHINE" = "weaver" ]; then
   source /etc/profile.d/modules.sh
   SKIP_HWLOC=True
 
-  BASE_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/<COMPILER_VERSION>"
-  IBM_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
-  CUDA_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1"
-  CUDA10_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1"
+  BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
+  IBM_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
+  CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1"
+  CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1"
 
   # Don't do pthread with Power
   GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
@@ -430,7 +467,7 @@ elif [ "$MACHINE" = "voltrino" ]; then
   SKIP_HWLOC=True
   export SLURM_TASKS_PER_NODE=32
 
-  BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/19.05.5a,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/9.3.0"
+  BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/9.3.0"
 
   # Format: (compiler module-list build-list exe-name warning-flag)
   COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
@@ -446,11 +483,12 @@ elif [ "$MACHINE" = "mayer" ]; then
   SKIP_HWLOC=True
   export SLURM_TASKS_PER_NODE=96
 
-  BASE_MODULE_LIST="cmake/3.14.5,<COMPILER_NAME>/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>"
 
   # Format: (compiler module-list build-list exe-name warning-flag)
   COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-             "arm/20.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS")
+             "gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS")
 
   if [ -z "$ARCH_FLAG" ]; then
     ARCH_FLAG="--arch=ARMV8_THUNDERX2"
@@ -461,10 +499,12 @@ elif [ "$MACHINE" = "blake" ]; then
   SKIP_HWLOC=True
   export SLURM_TASKS_PER_NODE=32
 
-  module load cmake/3.12.3
+  module load cmake/3.19.3
 
-  BASE_MODULE_LIST="cmake/3.12.3,<COMPILER_NAME>/<COMPILER_VERSION>"
-  BASE_MODULE_LIST_INTEL="cmake/3.12.3,<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
+  BASE_MODULE_LIST_INTEL="cmake/3.19.3,<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+  BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,<COMPILER_NAME>/oneAPI/base-toolkit/<COMPILER_VERSION>"
+  ONEAPI_WARNING_FLAGS=""
 
   if [ "$SPOT_CHECK" = "True" ]; then
 
@@ -479,12 +519,14 @@ elif [ "$MACHINE" = "blake" ]; then
              "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
              "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
              "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS"
              "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
              "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
   )
 
   fi
@@ -498,18 +540,18 @@ elif [ "$MACHINE" = "apollo" ]; then
 
   module load sems-git
   module load sems-tex
-  module load sems-cmake/3.12.2
+  module load sems-cmake/3.17.1
   module load sems-gdb
   module load binutils
 
   SKIP_HWLOC=True
 
-  BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
-  CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>"
-  CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
-  CUDA10X_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
+  BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
+  CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>"
+  CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
+  CUDA10X_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
 
-  HPX3_MODULE_LIST="sems-env,sems-cmake/3.12.2,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils"
+  HPX3_MODULE_LIST="sems-env,sems-cmake/3.17.1,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils"
 
   BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
   BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
@@ -548,19 +590,19 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then
 
   module load sems-git
   module load sems-tex
-  module load sems-cmake/3.12.2
+  module load sems-cmake/3.17.1
   module load sems-gdb
 
   SKIP_HWLOC=True
 
-  BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
-  GCC91_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>"
-  NVCC9_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
-  NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
-  NVCC11_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0"
+  BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
+  GCC91_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>"
+  NVCC9_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
+  NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
+  NVCC11_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0"
 
-  CLANG8_MODULE_LIST="sems-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/10.0"
-  PGI_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-gcc/7.3.0,<COMPILER_NAME>/<COMPILER_VERSION>"
+  CLANG8_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/10.0"
+  PGI_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-gcc/7.3.0,<COMPILER_NAME>/<COMPILER_VERSION>"
 
   BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread"
   BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP"
@@ -768,8 +810,8 @@ setup_env() {
   done
 
   if [ -e ${CM_ALL_SCRIPT_PATH}/update_lib.sh ]; then
-     echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE"
-     source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE
+     echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler"
+     source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler
   fi
   return 0
 }
@@ -851,8 +893,12 @@ single_build_and_test() {
     echo "        \$KOKKOS_PATH/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh
 
     run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+    local make_par_lvl=12
+    if [[ "$MACHINE" = white* ]]; then
+      make_par_lvl=48
+    fi
     local -i build_start_time=$(date +%s)
-    run_cmd make -j 48 all >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+    run_cmd make -j  $make_par_lvl all >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
     local -i build_end_time=$(date +%s)
     comment="build_time=$(($build_end_time-$build_start_time))"
 
diff --git a/packages/kokkos/scripts/testing_scripts/update_lib.sh b/packages/kokkos/scripts/testing_scripts/update_lib.sh
index 47f9745759637b5c7fad45af06ce15595bf3afd1..34ab5dd3c9a0afae4b10b70d99772308f35b3f9f 100755
--- a/packages/kokkos/scripts/testing_scripts/update_lib.sh
+++ b/packages/kokkos/scripts/testing_scripts/update_lib.sh
@@ -1,30 +1,53 @@
 #!/bin/bash
 
-if [ "$1" = blake ]; then
-  ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)"
-  if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then
-    module swap gcc/4.9.3 gcc/6.4.0
-    module list
-  fi
-fi
-if [ "$1" = kokkos-dev ]; then
+local machine_input="$1"
+local compiler_input="$2"
+
+check_sems_intel() {
   ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)"
   if [[ "${ICPCVER}" = 17.* ]]; then
     module swap sems-gcc/4.9.3 sems-gcc/6.4.0
     module list
   fi
-fi
-if [ "$1" = kokkos-dev-2 ]; then
-  ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)"
-  if [[ "${ICPCVER}" = 17.* ]]; then
-    module swap sems-gcc/4.9.3 sems-gcc/6.4.0
+  if [[ "${ICPCVER}" = 19.* ]]; then
+    # Newer gcc needed for c++ standard beyond c++14
+    module swap sems-gcc/6.1.0 sems-gcc/7.2.0
     module list
   fi
-fi
-if [ "$1" = sems ]; then
+}
+
+check_sems_clang() {
+  CLANGVER=$(clang --version | grep "clang version" | cut -d " " -f 3)
+  if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then
+    # Newer gcc needed for c++ standard beyond c++14
+    module swap sems-gcc/5.3.0 sems-gcc/6.4.0
+    module list
+  fi
+}
+
+check_compiler_modules() {
+  if [[ "$compiler_input" = clang/* ]]; then
+    echo "  clang compiler - check supporting modules"
+    check_sems_clang
+  elif [[ "$compiler_input" = intel/* ]]; then
+    echo "  intel compiler - check supporting modules"
+    check_sems_intel
+  fi
+}
+
+if [ "$machine_input" = blake ]; then
   ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)"
-  if [[ "${ICPCVER}" = 17.* ]]; then
-    module swap sems-gcc/4.9.3 sems-gcc/6.4.0
+  if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then
+    module swap gcc/4.9.3 gcc/6.4.0
     module list
   fi
 fi
+if [ "$machine_input" = kokkos-dev ]; then
+  check_compiler_modules
+fi
+if [ "$machine_input" = kokkos-dev-2 ]; then
+  check_compiler_modules
+fi
+if [ "$machine_input" = sems ] || [ "$machine_input" = sogpu ]; then
+  check_compiler_modules
+fi