From be756b7e799fd01ef3e3bf3eaa4a671ccc23df65 Mon Sep 17 00:00:00 2001
From: Stephane Del Pino <stephane.delpino44@gmail.com>
Date: Thu, 22 Sep 2022 21:22:19 +0200
Subject: [PATCH] git subrepo pull (merge) packages/kokkos

subrepo:
  subdir:   "packages/kokkos"
  merged:   "61d7db55f"
upstream:
  origin:   "git@github.com:kokkos/kokkos.git"
  branch:   "master"
  commit:   "61d7db55f"
git-subrepo:
  version:  "0.4.3"
  origin:   "git@github.com:ingydotnet/git-subrepo.git"
  commit:   "2f68596"
---
 CMakeLists.txt                                |    1 +
 .../continuous-integration-workflow-hpx.yml   |   88 +
 .../continuous-integration-workflow.yml       |   29 +-
 packages/kokkos/.github/workflows/osx.yml     |    1 +
 packages/kokkos/.gitignore                    |    1 +
 packages/kokkos/.gitrepo                      |    4 +-
 packages/kokkos/.jenkins                      |   70 +-
 packages/kokkos/BUILD.md                      |   20 +-
 packages/kokkos/CHANGELOG.md                  |  152 +
 packages/kokkos/CMakeLists.txt                |   31 +-
 packages/kokkos/Makefile.kokkos               |  248 +-
 packages/kokkos/Makefile.targets              |   25 +-
 packages/kokkos/README.md                     |  262 +-
 packages/kokkos/algorithms/src/CMakeLists.txt |    1 +
 .../kokkos/algorithms/src/Kokkos_Random.hpp   |  125 +-
 .../kokkos/algorithms/src/Kokkos_Sort.hpp     |   94 +-
 .../algorithms/src/Kokkos_StdAlgorithms.hpp   |  126 +-
 .../Kokkos_AdjacentDifference.hpp             |   99 +-
 .../std_algorithms/Kokkos_AdjacentFind.hpp    |  124 +
 .../src/std_algorithms/Kokkos_AllOf.hpp       |   94 +
 .../src/std_algorithms/Kokkos_AnyOf.hpp       |   94 +
 .../src/std_algorithms/Kokkos_BeginEnd.hpp    |    4 +-
 .../src/std_algorithms/Kokkos_Copy.hpp        |   97 +
 .../std_algorithms/Kokkos_CopyBackward.hpp    |   95 +
 .../src/std_algorithms/Kokkos_CopyIf.hpp      |   99 +
 .../src/std_algorithms/Kokkos_CopyN.hpp       |   98 +
 .../src/std_algorithms/Kokkos_Count.hpp       |   94 +
 .../src/std_algorithms/Kokkos_CountIf.hpp     |   99 +
 .../src/std_algorithms/Kokkos_Distance.hpp    |    4 +-
 .../src/std_algorithms/Kokkos_Equal.hpp       |  198 ++
 .../std_algorithms/Kokkos_ExclusiveScan.hpp   |  190 ++
 .../src/std_algorithms/Kokkos_Fill.hpp        |   86 +
 .../src/std_algorithms/Kokkos_FillN.hpp       |   91 +
 .../src/std_algorithms/Kokkos_Find.hpp        |   89 +
 .../src/std_algorithms/Kokkos_FindEnd.hpp     |  149 +
 .../src/std_algorithms/Kokkos_FindFirstOf.hpp |  150 +
 .../src/std_algorithms/Kokkos_FindIf.hpp      |   95 +
 .../src/std_algorithms/Kokkos_FindIfNot.hpp   |   98 +
 .../src/std_algorithms/Kokkos_ForEach.hpp     |   95 +
 .../src/std_algorithms/Kokkos_ForEachN.hpp    |   96 +
 .../src/std_algorithms/Kokkos_Generate.hpp    |   91 +
 .../src/std_algorithms/Kokkos_GenerateN.hpp   |   93 +
 .../std_algorithms/Kokkos_InclusiveScan.hpp   |  223 ++
 .../std_algorithms/Kokkos_IsPartitioned.hpp   |   92 +
 .../src/std_algorithms/Kokkos_IsSorted.hpp    |  131 +
 .../std_algorithms/Kokkos_IsSortedUntil.hpp   |  134 +
 ...yingOperations.hpp => Kokkos_IterSwap.hpp} |   31 +-
 .../Kokkos_LexicographicalCompare.hpp         |  154 +
 .../src/std_algorithms/Kokkos_MaxElement.hpp  |  132 +
 .../src/std_algorithms/Kokkos_MinElement.hpp  |  132 +
 .../std_algorithms/Kokkos_MinMaxElement.hpp   |  133 +
 .../Kokkos_MinMaxElementOperations.hpp        |  409 ---
 .../src/std_algorithms/Kokkos_Mismatch.hpp    |  160 +
 .../src/std_algorithms/Kokkos_Move.hpp        |   94 +
 .../std_algorithms/Kokkos_MoveBackward.hpp    |   95 +
 .../Kokkos_NonModifyingSequenceOperations.hpp | 2406 ---------------
 .../src/std_algorithms/Kokkos_NoneOf.hpp      |   94 +
 .../std_algorithms/Kokkos_PartitionCopy.hpp   |  110 +
 .../std_algorithms/Kokkos_PartitionPoint.hpp  |   91 +
 .../Kokkos_PartitioningOperations.hpp         |  491 ---
 .../{numeric => }/Kokkos_Reduce.hpp           |  113 +-
 .../src/std_algorithms/Kokkos_Remove.hpp      |   91 +
 .../src/std_algorithms/Kokkos_RemoveCopy.hpp  |  106 +
 .../std_algorithms/Kokkos_RemoveCopyIf.hpp    |  110 +
 .../src/std_algorithms/Kokkos_RemoveIf.hpp    |   92 +
 .../src/std_algorithms/Kokkos_Replace.hpp     |   93 +
 .../src/std_algorithms/Kokkos_ReplaceCopy.hpp |  107 +
 .../std_algorithms/Kokkos_ReplaceCopyIf.hpp   |  111 +
 .../src/std_algorithms/Kokkos_ReplaceIf.hpp   |   96 +
 .../src/std_algorithms/Kokkos_Reverse.hpp     |   87 +
 .../src/std_algorithms/Kokkos_ReverseCopy.hpp |   95 +
 .../src/std_algorithms/Kokkos_Rotate.hpp      |   89 +
 .../src/std_algorithms/Kokkos_RotateCopy.hpp  |  100 +
 .../src/std_algorithms/Kokkos_Search.hpp      |  148 +
 .../src/std_algorithms/Kokkos_SearchN.hpp     |  144 +
 .../src/std_algorithms/Kokkos_ShiftLeft.hpp   |   89 +
 .../src/std_algorithms/Kokkos_ShiftRight.hpp  |   89 +
 .../Kokkos_SortingOperations.hpp              |  378 ---
 .../src/std_algorithms/Kokkos_Swap.hpp        |   69 +
 .../src/std_algorithms/Kokkos_SwapRanges.hpp  |   97 +
 .../src/std_algorithms/Kokkos_Transform.hpp   |  166 +
 .../Kokkos_TransformExclusiveScan.hpp         |  131 +
 .../Kokkos_TransformInclusiveScan.hpp         |  190 ++
 .../{numeric => }/Kokkos_TransformReduce.hpp  |  213 +-
 .../src/std_algorithms/Kokkos_Unique.hpp      |  124 +
 .../src/std_algorithms/Kokkos_UniqueCopy.hpp  |  143 +
 .../impl/Kokkos_AdjacentDifference.hpp        |  135 +
 .../impl/Kokkos_AdjacentFind.hpp              |  140 +
 .../impl/Kokkos_AllOfAnyOfNoneOf.hpp          |   77 +
 .../{ => impl}/Kokkos_Constraints.hpp         |    0
 .../impl/Kokkos_CopyBackward.hpp              |  103 +
 .../std_algorithms/impl/Kokkos_CopyCopyN.hpp  |  116 +
 .../src/std_algorithms/impl/Kokkos_CopyIf.hpp |  142 +
 .../impl/Kokkos_CountCountIf.hpp              |  112 +
 .../src/std_algorithms/impl/Kokkos_Equal.hpp  |  147 +
 .../impl/Kokkos_ExclusiveScan.hpp             |  232 ++
 .../std_algorithms/impl/Kokkos_FillFillN.hpp} |  112 +-
 .../std_algorithms/impl/Kokkos_FindEnd.hpp    |  191 ++
 .../impl/Kokkos_FindFirstOf.hpp               |  161 +
 .../impl/Kokkos_FindIfOrNot.hpp               |  146 +
 .../impl/Kokkos_ForEachForEachN.hpp           |  113 +
 .../impl/Kokkos_GenerateGenerateN.hpp         |  105 +
 .../{ => impl}/Kokkos_HelperPredicates.hpp    |    4 +-
 .../Kokkos_IdentityReferenceUnaryFunctor.hpp  |    4 +-
 .../impl/Kokkos_InclusiveScan.hpp             |  243 ++
 .../impl/Kokkos_IsPartitioned.hpp             |  148 +
 .../std_algorithms/impl/Kokkos_IsSorted.hpp   |  117 +
 .../impl/Kokkos_IsSortedUntil.hpp             |  153 +
 .../impl/Kokkos_LexicographicalCompare.hpp    |  184 ++
 .../impl/Kokkos_MinMaxMinmaxElement.hpp       |  167 +
 .../std_algorithms/impl/Kokkos_Mismatch.hpp   |  162 +
 .../src/std_algorithms/impl/Kokkos_Move.hpp   |  100 +
 .../impl/Kokkos_MoveBackward.hpp              |  104 +
 .../impl/Kokkos_PartitionCopy.hpp             |  180 ++
 .../impl/Kokkos_PartitionPoint.hpp            |  132 +
 .../Kokkos_RandomAccessIterator.hpp           |   24 +-
 .../src/std_algorithms/impl/Kokkos_Reduce.hpp |  186 ++
 ...cerWithArbitraryJoinerNoNeutralElement.hpp |   11 +-
 .../impl/Kokkos_RemoveAllVariants.hpp         |  212 ++
 .../std_algorithms/impl/Kokkos_Replace.hpp    |  103 +
 .../impl/Kokkos_ReplaceCopy.hpp               |  122 +
 .../impl/Kokkos_ReplaceCopyIf.hpp             |  123 +
 .../std_algorithms/impl/Kokkos_ReplaceIf.hpp  |  105 +
 .../std_algorithms/impl/Kokkos_Reverse.hpp    |  111 +
 .../impl/Kokkos_ReverseCopy.hpp               |  102 +
 .../src/std_algorithms/impl/Kokkos_Rotate.hpp |  219 ++
 .../std_algorithms/impl/Kokkos_RotateCopy.hpp |  149 +
 .../src/std_algorithms/impl/Kokkos_Search.hpp |  191 ++
 .../std_algorithms/impl/Kokkos_SearchN.hpp    |  205 ++
 .../std_algorithms/impl/Kokkos_ShiftLeft.hpp  |  139 +
 .../std_algorithms/impl/Kokkos_ShiftRight.hpp |  139 +
 .../std_algorithms/impl/Kokkos_SwapRanges.hpp |  112 +
 .../std_algorithms/impl/Kokkos_Transform.hpp  |  158 +
 .../impl/Kokkos_TransformExclusiveScan.hpp    |  153 +
 .../impl/Kokkos_TransformInclusiveScan.hpp    |  235 ++
 .../impl/Kokkos_TransformReduce.hpp           |  245 ++
 .../src/std_algorithms/impl/Kokkos_Unique.hpp |  193 ++
 .../std_algorithms/impl/Kokkos_UniqueCopy.hpp |  156 +
 ...Kokkos_ValueWrapperForNoNeutralElement.hpp |   10 +-
 ...Kokkos_ModifyingSequenceOperationsSet1.hpp | 1285 --------
 ...Kokkos_ModifyingSequenceOperationsSet2.hpp | 1783 -----------
 .../numeric/Kokkos_ExclusiveScan.hpp          |  517 ----
 .../numeric/Kokkos_InclusiveScan.hpp          |  699 -----
 .../algorithms/unit_tests/CMakeLists.txt      |    1 +
 .../algorithms/unit_tests/TestRandom.hpp      |   20 +-
 .../unit_tests/TestRandomAccessIterator.cpp   |   39 +-
 .../kokkos/algorithms/unit_tests/TestSort.hpp |   13 +-
 .../TestStdAlgorithmsAdjacentDifference.cpp   |   12 +-
 .../TestStdAlgorithmsAdjacentFind.cpp         |    4 +-
 .../TestStdAlgorithmsAllAnyNoneOf.cpp         |    4 +-
 .../unit_tests/TestStdAlgorithmsCommon.hpp    |   68 +-
 .../TestStdAlgorithmsCompileOnly.cpp          |   19 -
 .../TestStdAlgorithmsConstraints.cpp          |    2 +-
 .../unit_tests/TestStdAlgorithmsCopyIf.cpp    |   71 +-
 .../unit_tests/TestStdAlgorithmsCount.cpp     |    2 -
 .../unit_tests/TestStdAlgorithmsEqual.cpp     |    3 -
 .../TestStdAlgorithmsExclusiveScan.cpp        |   58 +-
 .../unit_tests/TestStdAlgorithmsFind.cpp      |    2 -
 .../unit_tests/TestStdAlgorithmsFindEnd.cpp   |   10 +-
 .../TestStdAlgorithmsFindFirstOf.cpp          |   10 +-
 .../unit_tests/TestStdAlgorithmsForEach.cpp   |    2 -
 .../TestStdAlgorithmsHelperFunctors.hpp       |    6 -
 .../TestStdAlgorithmsInclusiveScan.cpp        |   58 +-
 .../unit_tests/TestStdAlgorithmsIsSorted.cpp  |    4 +-
 .../TestStdAlgorithmsIsSortedUntil.cpp        |   20 +-
 ...estStdAlgorithmsLexicographicalCompare.cpp |    5 +-
 .../TestStdAlgorithmsMinMaxElementOps.cpp     |   23 +-
 .../unit_tests/TestStdAlgorithmsMismatch.cpp  |   18 +-
 .../unit_tests/TestStdAlgorithmsModOps.cpp    |   45 +-
 .../unit_tests/TestStdAlgorithmsModSeqOps.cpp |   42 +-
 .../TestStdAlgorithmsMoveBackward.cpp         |  135 +
 .../unit_tests/TestStdAlgorithmsNumerics.cpp  |  155 +-
 .../TestStdAlgorithmsPartitionCopy.cpp        |   38 +-
 .../TestStdAlgorithmsPartitioningOps.cpp      |    7 +-
 .../unit_tests/TestStdAlgorithmsRemove.cpp    |    6 +-
 .../TestStdAlgorithmsRemoveCopy.cpp           |    6 +-
 .../TestStdAlgorithmsRemoveCopyIf.cpp         |    6 +-
 .../unit_tests/TestStdAlgorithmsRemoveIf.cpp  |    6 +-
 .../unit_tests/TestStdAlgorithmsReplace.cpp   |   26 +-
 .../TestStdAlgorithmsReplaceCopy.cpp          |   58 +-
 .../TestStdAlgorithmsReplaceCopyIf.cpp        |   58 +-
 .../unit_tests/TestStdAlgorithmsReplaceIf.cpp |    4 +-
 .../unit_tests/TestStdAlgorithmsReverse.cpp   |    4 +-
 .../unit_tests/TestStdAlgorithmsRotate.cpp    |    6 +-
 .../TestStdAlgorithmsRotateCopy.cpp           |   12 +-
 .../TestStdAlgorithmsScalarRedVsView.cpp      |  235 --
 .../unit_tests/TestStdAlgorithmsSearch.cpp    |   10 +-
 .../unit_tests/TestStdAlgorithmsSearch_n.cpp  |   10 +-
 .../unit_tests/TestStdAlgorithmsShiftLeft.cpp |    6 +-
 .../TestStdAlgorithmsShiftRight.cpp           |    6 +-
 ...estStdAlgorithmsTransformExclusiveScan.cpp |   22 +-
 ...estStdAlgorithmsTransformInclusiveScan.cpp |   22 +-
 .../TestStdAlgorithmsTransformUnaryOp.cpp     |    4 +-
 .../unit_tests/TestStdAlgorithmsUnique.cpp    |   12 +-
 .../TestStdAlgorithmsUniqueCopy.cpp           |   72 +-
 .../algorithms/unit_tests/TestStdReducers.cpp |    4 +-
 packages/kokkos/appveyor.yml                  |    2 +-
 .../benchmarks/bytes_and_flops/bench.hpp      |   36 +-
 .../bytes_and_flops/bench_double.cpp          |    2 +-
 .../bytes_and_flops/bench_float.cpp           |    2 +-
 .../bytes_and_flops/bench_int32_t.cpp         |    2 +-
 .../bytes_and_flops/bench_int64_t.cpp         |    2 +-
 .../bytes_and_flops/bench_stride.hpp          |   51 +-
 .../bytes_and_flops/bench_unroll_stride.hpp   |   90 +-
 .../benchmarks/bytes_and_flops/main.cpp       |   42 +-
 packages/kokkos/bin/nvcc_wrapper              |   75 +-
 packages/kokkos/cmake/Dependencies.cmake      |   12 +
 .../kokkos/cmake/KokkosConfigCommon.cmake.in  |    1 +
 packages/kokkos/cmake/KokkosCore_config.h.in  |   11 +-
 .../kokkos/cmake/Modules/FindTPLLIBDL.cmake   |    2 +-
 .../kokkos/cmake/Modules/FindTPLTHREADS.cmake |    2 +-
 packages/kokkos/cmake/fake_tribits.cmake      |    2 -
 packages/kokkos/cmake/kokkos_arch.cmake       |  278 +-
 .../kokkos/cmake/kokkos_compiler_id.cmake     |   14 +-
 .../kokkos/cmake/kokkos_enable_devices.cmake  |    2 +
 .../kokkos/cmake/kokkos_enable_options.cmake  |    1 +
 packages/kokkos/cmake/kokkos_functions.cmake  |   78 +-
 .../kokkos/cmake/kokkos_pick_cxx_std.cmake    |    4 +-
 .../kokkos/cmake/kokkos_test_cxx_std.cmake    |    4 +-
 packages/kokkos/cmake/kokkos_tpls.cmake       |    5 +
 packages/kokkos/cmake/kokkos_tribits.cmake    |    4 +-
 .../performance_tests/TestGlobal2LocalIds.hpp |    4 +-
 .../performance_tests/TestScatterView.hpp     |    6 +-
 .../TestUnorderedMapPerformance.hpp           |    2 +-
 .../kokkos/containers/src/Kokkos_Bitset.hpp   |   16 +-
 .../kokkos/containers/src/Kokkos_DualView.hpp |  220 +-
 .../containers/src/Kokkos_DynRankView.hpp     | 1058 ++++---
 .../containers/src/Kokkos_DynamicView.hpp     |  498 ++-
 .../containers/src/Kokkos_ErrorReporter.hpp   |    8 +
 .../containers/src/Kokkos_Functional.hpp      |   82 +-
 .../containers/src/Kokkos_OffsetView.hpp      |  675 ++--
 .../containers/src/Kokkos_ScatterView.hpp     |  122 +-
 .../containers/src/Kokkos_StaticCrsGraph.hpp  |   14 +-
 .../containers/src/Kokkos_UnorderedMap.hpp    |   42 +-
 .../kokkos/containers/src/Kokkos_Vector.hpp   |   11 +-
 .../src/impl/Kokkos_Bitset_impl.hpp           |    4 +-
 .../impl/Kokkos_StaticCrsGraph_factory.hpp    |   18 +-
 .../src/impl/Kokkos_UnorderedMap_impl.cpp     |    4 +
 .../src/impl/Kokkos_UnorderedMap_impl.hpp     |   11 +-
 .../containers/unit_tests/TestBitset.hpp      |   12 +-
 .../containers/unit_tests/TestDualView.hpp    |   17 +-
 .../containers/unit_tests/TestDynViewAPI.hpp  |   48 +-
 .../containers/unit_tests/TestOffsetView.hpp  |    8 +-
 .../containers/unit_tests/TestScatterView.hpp |   60 +-
 .../unit_tests/TestUnorderedMap.hpp           |   10 +-
 .../containers/unit_tests/TestVector.hpp      |   13 +
 .../unit_tests/TestWithoutInitializing.hpp    |  552 ++++
 packages/kokkos/core/perf_test/CMakeLists.txt |    3 +
 .../core/perf_test/PerfTestBlasKernels.hpp    |    6 +-
 .../core/perf_test/PerfTestGramSchmidt.cpp    |    2 +-
 packages/kokkos/core/src/CMakeLists.txt       |   54 +-
 .../kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp |   88 +-
 .../Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp    |  125 +-
 .../core/src/Cuda/Kokkos_Cuda_Error.hpp       |   31 +-
 .../src/Cuda/Kokkos_Cuda_Half_Conversion.hpp  |   90 +-
 .../core/src/Cuda/Kokkos_Cuda_Instance.cpp    |  261 +-
 .../src/Cuda/Kokkos_Cuda_KernelLaunch.hpp     |    2 +-
 .../core/src/Cuda/Kokkos_Cuda_Locks.cpp       |   11 +-
 .../core/src/Cuda/Kokkos_Cuda_Locks.hpp       |   58 +-
 .../core/src/Cuda/Kokkos_Cuda_Parallel.hpp    | 2722 -----------------
 .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp |  477 +++
 .../src/Cuda/Kokkos_Cuda_Parallel_Range.hpp   | 1049 +++++++
 .../src/Cuda/Kokkos_Cuda_Parallel_Team.hpp    | 1139 +++++++
 .../core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp  |  563 ++--
 .../kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp |    4 +
 .../kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp |    9 +-
 .../kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp |  627 ++--
 .../src/Cuda/Kokkos_Cuda_Vectorization.hpp    |   27 +-
 .../kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp |   31 +-
 .../src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp  |   10 +-
 .../kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp  |   22 +-
 .../kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp |   89 +-
 .../HIP/Kokkos_HIP_BlockSize_Deduction.hpp    |    6 +-
 .../core/src/HIP/Kokkos_HIP_Instance.cpp      |   59 +-
 .../kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp  |    4 +
 .../src/HIP/Kokkos_HIP_Parallel_MDRange.hpp   |   67 +-
 .../src/HIP/Kokkos_HIP_Parallel_Range.hpp     |  201 +-
 .../core/src/HIP/Kokkos_HIP_Parallel_Team.hpp |  146 +-
 .../core/src/HIP/Kokkos_HIP_ReduceScan.hpp    |  270 +-
 .../src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp     |  174 +-
 .../kokkos/core/src/HIP/Kokkos_HIP_Space.cpp  |  265 +-
 .../kokkos/core/src/HIP/Kokkos_HIP_Team.hpp   |  122 +-
 .../core/src/HIP/Kokkos_HIP_UniqueToken.hpp   |    3 +-
 .../core/src/HIP/Kokkos_HIP_Vectorization.hpp |   22 +-
 .../src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp    |   10 +-
 packages/kokkos/core/src/HPX/Kokkos_HPX.cpp   |   81 +-
 .../kokkos/core/src/HPX/Kokkos_HPX_Task.cpp   |    4 +
 .../kokkos/core/src/HPX/Kokkos_HPX_Task.hpp   |    6 +-
 .../src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp    |    8 +-
 .../kokkos/core/src/KokkosExp_InterOp.hpp     |    8 +
 .../core/src/KokkosExp_MDRangePolicy.hpp      |   11 +-
 .../src/Kokkos_AcquireUniqueTokenImpl.hpp     |    9 +
 .../kokkos/core/src/Kokkos_AnonymousSpace.hpp |    9 +
 packages/kokkos/core/src/Kokkos_Array.hpp     |  107 +-
 packages/kokkos/core/src/Kokkos_Atomic.hpp    |    8 +
 .../core/src/Kokkos_Atomics_Desul_Config.hpp  |    9 +
 .../Kokkos_Atomics_Desul_Volatile_Wrapper.hpp |   10 +
 .../core/src/Kokkos_Atomics_Desul_Wrapper.hpp |   10 +
 packages/kokkos/core/src/Kokkos_Complex.hpp   |  229 +-
 packages/kokkos/core/src/Kokkos_Concepts.hpp  |   41 +-
 packages/kokkos/core/src/Kokkos_CopyViews.hpp | 1212 +++++---
 packages/kokkos/core/src/Kokkos_Core.hpp      |  250 +-
 packages/kokkos/core/src/Kokkos_Core_fwd.hpp  |   34 +-
 packages/kokkos/core/src/Kokkos_Crs.hpp       |   12 +-
 packages/kokkos/core/src/Kokkos_Cuda.hpp      |   43 +-
 packages/kokkos/core/src/Kokkos_CudaSpace.hpp |  107 +-
 .../kokkos/core/src/Kokkos_DetectionIdiom.hpp |    8 +
 .../kokkos/core/src/Kokkos_ExecPolicy.hpp     |   56 +-
 packages/kokkos/core/src/Kokkos_Extents.hpp   |   16 +-
 packages/kokkos/core/src/Kokkos_Future.hpp    |   90 +-
 packages/kokkos/core/src/Kokkos_Graph.hpp     |    8 +
 packages/kokkos/core/src/Kokkos_GraphNode.hpp |   34 +-
 packages/kokkos/core/src/Kokkos_Graph_fwd.hpp |    8 +
 packages/kokkos/core/src/Kokkos_HBWSpace.hpp  |    9 +
 packages/kokkos/core/src/Kokkos_HIP.hpp       |    9 +
 packages/kokkos/core/src/Kokkos_HIP_Space.hpp |  226 +-
 packages/kokkos/core/src/Kokkos_HPX.hpp       |  604 ++--
 packages/kokkos/core/src/Kokkos_Half.hpp      |    8 +
 packages/kokkos/core/src/Kokkos_HostSpace.hpp |   56 +-
 packages/kokkos/core/src/Kokkos_Layout.hpp    |   61 +-
 .../kokkos/core/src/Kokkos_LogicalSpaces.hpp  |   23 +-
 packages/kokkos/core/src/Kokkos_Macros.hpp    |   65 +-
 .../kokkos/core/src/Kokkos_MasterLock.hpp     |    9 +
 .../core/src/Kokkos_MathematicalConstants.hpp |    8 +
 .../core/src/Kokkos_MathematicalFunctions.hpp |  140 +-
 .../Kokkos_MathematicalSpecialFunctions.hpp   |   62 +-
 .../kokkos/core/src/Kokkos_MemoryPool.hpp     |    9 +
 .../kokkos/core/src/Kokkos_MemoryTraits.hpp   |    9 +
 .../kokkos/core/src/Kokkos_MinMaxClamp.hpp    |   18 +-
 .../kokkos/core/src/Kokkos_NumericTraits.hpp  |    8 +
 packages/kokkos/core/src/Kokkos_OpenMP.hpp    |   61 +-
 .../kokkos/core/src/Kokkos_OpenMPTarget.hpp   |   50 +-
 .../core/src/Kokkos_OpenMPTargetSpace.hpp     |   44 +-
 packages/kokkos/core/src/Kokkos_Pair.hpp      |   29 +-
 packages/kokkos/core/src/Kokkos_Parallel.hpp  |  213 +-
 .../core/src/Kokkos_Parallel_Reduce.hpp       |  403 +--
 .../core/src/Kokkos_PointerOwnership.hpp      |    9 +
 .../src/Kokkos_Profiling_ProfileSection.hpp   |    8 +
 packages/kokkos/core/src/Kokkos_Rank.hpp      |    9 +
 packages/kokkos/core/src/Kokkos_SYCL.hpp      |   76 +-
 .../kokkos/core/src/Kokkos_SYCL_Space.hpp     |   69 +
 .../kokkos/core/src/Kokkos_ScratchSpace.hpp   |    9 +
 packages/kokkos/core/src/Kokkos_Serial.hpp    |  984 +-----
 .../kokkos/core/src/Kokkos_TaskScheduler.hpp  |   69 +-
 .../core/src/Kokkos_TaskScheduler_fwd.hpp     |    9 +
 packages/kokkos/core/src/Kokkos_Threads.hpp   |   51 +-
 packages/kokkos/core/src/Kokkos_Timer.hpp     |    8 +
 packages/kokkos/core/src/Kokkos_Tuners.hpp    |    9 +
 .../kokkos/core/src/Kokkos_UniqueToken.hpp    |    9 +
 .../kokkos/core/src/Kokkos_Vectorization.hpp  |    9 +
 packages/kokkos/core/src/Kokkos_View.hpp      | 1180 ++++---
 .../core/src/Kokkos_WorkGraphPolicy.hpp       |   11 +-
 packages/kokkos/core/src/Kokkos_hwloc.hpp     |    9 +
 .../core/src/OpenACC/Kokkos_OpenACC.cpp       |   98 +
 .../core/src/OpenACC/Kokkos_OpenACC.hpp       |  126 +
 .../core/src/OpenACC/Kokkos_OpenACCSpace.cpp  |  222 ++
 .../core/src/OpenACC/Kokkos_OpenACCSpace.hpp  |  249 ++
 .../src/OpenACC/Kokkos_OpenACC_Instance.cpp   |  118 +
 .../src/OpenACC/Kokkos_OpenACC_Instance.hpp   |   79 +
 .../src/OpenACC/Kokkos_OpenACC_Traits.hpp     |   65 +
 ...MP_Exec.cpp => Kokkos_OpenMP_Instance.cpp} |  233 +-
 ...MP_Exec.hpp => Kokkos_OpenMP_Instance.hpp} |  121 +-
 .../src/OpenMP/Kokkos_OpenMP_Parallel.hpp     |  562 ++--
 .../core/src/OpenMP/Kokkos_OpenMP_Task.cpp    |    4 +
 .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp    |   17 +-
 .../core/src/OpenMP/Kokkos_OpenMP_Team.hpp    |    2 +-
 .../OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp  |    4 +-
 .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp |  108 +-
 .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp |    9 +-
 .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp |  210 +-
 .../Kokkos_OpenMPTarget_Instance.cpp          |   85 +-
 .../Kokkos_OpenMPTarget_Instance.hpp          |    2 +-
 .../Kokkos_OpenMPTarget_Parallel.hpp          |  292 +-
 .../Kokkos_OpenMPTarget_Parallel_MDRange.hpp  |  261 +-
 .../OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp |    4 +
 packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp |  115 +-
 .../core/src/SYCL/Kokkos_SYCL_Instance.cpp    |   23 +-
 .../core/src/SYCL/Kokkos_SYCL_Instance.hpp    |   20 +-
 .../src/SYCL/Kokkos_SYCL_Parallel_Range.hpp   |   10 +-
 .../src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp  |  322 +-
 .../src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp    |   77 +-
 .../src/SYCL/Kokkos_SYCL_Parallel_Team.hpp    |  393 ++-
 .../core/src/SYCL/Kokkos_SYCL_Space.cpp       |  136 +-
 .../kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp |  134 +-
 .../src/{impl => Serial}/Kokkos_Serial.cpp    |   72 +-
 .../Serial/Kokkos_Serial_Parallel_MDRange.hpp |  213 ++
 .../Serial/Kokkos_Serial_Parallel_Range.hpp   |  337 ++
 .../Serial/Kokkos_Serial_Parallel_Team.hpp    |  424 +++
 .../{impl => Serial}/Kokkos_Serial_Task.cpp   |   11 +-
 .../{impl => Serial}/Kokkos_Serial_Task.hpp   |    6 +-
 .../src/Serial/Kokkos_Serial_UniqueToken.hpp  |  109 +
 .../Kokkos_Serial_WorkGraphPolicy.hpp         |    4 +-
 .../core/src/Threads/Kokkos_ThreadsExec.cpp   |  137 +-
 .../core/src/Threads/Kokkos_ThreadsExec.hpp   |  195 +-
 .../core/src/Threads/Kokkos_ThreadsTeam.hpp   |  158 +-
 .../src/Threads/Kokkos_Threads_Parallel.hpp   | 1034 -------
 .../Kokkos_Threads_Parallel_MDRange.hpp       |  322 ++
 .../Threads/Kokkos_Threads_Parallel_Range.hpp |  485 +++
 .../Threads/Kokkos_Threads_Parallel_Team.hpp  |  279 ++
 .../Threads/Kokkos_Threads_UniqueToken.hpp    |  157 +
 .../Kokkos_Threads_WorkGraphPolicy.hpp        |    4 +-
 .../core/src/View/Hooks/Kokkos_ViewHooks.hpp  |  151 +
 .../core/src/decl/Kokkos_Declare_CUDA.hpp     |    5 +-
 .../src/decl/Kokkos_Declare_OPENACC.hpp}      |   11 +-
 .../kokkos/core/src/desul/atomics/CUDA.hpp    |  541 ----
 .../desul/atomics/Compare_Exchange_SYCL.hpp   |  227 --
 .../kokkos/core/src/desul/atomics/SYCL.hpp    |  116 -
 .../kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp    |    1 +
 .../core/src/fwd/Kokkos_Fwd_OPENACC.hpp       |   56 +
 packages/kokkos/core/src/impl/CMakeLists.txt  |    2 +-
 .../src/impl/KokkosExp_Host_IterateTile.hpp   |   79 +-
 .../src/impl/KokkosExp_IterateTileGPU.hpp     |    3 -
 .../core/src/impl/Kokkos_AnalyzePolicy.hpp    |   15 +
 .../Kokkos_Atomic_Compare_Exchange_Strong.hpp |   64 +-
 .../Kokkos_Atomic_Compare_Exchange_Weak.hpp   |   28 +-
 .../core/src/impl/Kokkos_Atomic_Exchange.hpp  |   82 +-
 .../core/src/impl/Kokkos_Atomic_Fetch_Add.hpp |   53 +-
 .../core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp |   34 +-
 .../core/src/impl/Kokkos_Atomic_Generic.hpp   |   37 +-
 .../core/src/impl/Kokkos_Atomic_Load.hpp      |   36 +-
 .../core/src/impl/Kokkos_Atomic_Store.hpp     |   36 +-
 .../core/src/impl/Kokkos_Atomic_Windows.hpp   |   14 +-
 .../kokkos/core/src/impl/Kokkos_BitOps.hpp    |  190 +-
 .../core/src/impl/Kokkos_CPUDiscovery.cpp     |    4 +
 .../kokkos/core/src/impl/Kokkos_ChaseLev.hpp  |    4 +-
 .../kokkos/core/src/impl/Kokkos_ClockTic.hpp  |   32 +-
 .../core/src/impl/Kokkos_Combined_Reducer.hpp |  115 +-
 .../src/impl/Kokkos_Command_Line_Parsing.cpp  |  265 +-
 .../src/impl/Kokkos_Command_Line_Parsing.hpp  |   18 +-
 packages/kokkos/core/src/impl/Kokkos_Core.cpp | 1130 +++----
 ...alizer.hpp => Kokkos_DeviceManagement.hpp} |   27 +-
 packages/kokkos/core/src/impl/Kokkos_EBO.hpp  |   44 +-
 .../kokkos/core/src/impl/Kokkos_Error.cpp     |    7 +
 .../kokkos/core/src/impl/Kokkos_Error.hpp     |  100 +-
 .../core/src/impl/Kokkos_ExecPolicy.cpp       |   14 +-
 .../core/src/impl/Kokkos_ExecSpaceManager.hpp |  162 +
 .../core/src/impl/Kokkos_FunctorAdapter.hpp   | 2055 -------------
 .../core/src/impl/Kokkos_FunctorAnalysis.hpp  |  481 ++-
 .../kokkos/core/src/impl/Kokkos_HBWSpace.cpp  |    4 +
 .../core/src/impl/Kokkos_HostBarrier.cpp      |    4 +
 .../kokkos/core/src/impl/Kokkos_HostSpace.cpp |  160 +-
 .../src/impl/Kokkos_HostSpace_deepcopy.cpp    |   10 +-
 .../src/impl/Kokkos_HostSpace_deepcopy.hpp    |    2 +
 .../core/src/impl/Kokkos_HostThreadTeam.cpp   |    4 +
 .../core/src/impl/Kokkos_HostThreadTeam.hpp   |  221 +-
 .../impl/Kokkos_InitializationSettings.hpp    |  195 ++
 packages/kokkos/core/src/impl/Kokkos_LIFO.hpp |    7 +-
 .../core/src/impl/Kokkos_MemoryPool.cpp       |    4 +
 .../src/impl/Kokkos_MemoryPoolAllocator.hpp   |    2 +-
 .../core/src/impl/Kokkos_MemorySpace.cpp      |    4 +
 .../core/src/impl/Kokkos_MemorySpace.hpp      |   14 +
 .../core/src/impl/Kokkos_Memory_Fence.hpp     |    3 +-
 .../src/impl/Kokkos_MultipleTaskQueue.hpp     |   45 +-
 .../core/src/impl/Kokkos_NumericTraits.cpp    |    5 +
 .../core/src/impl/Kokkos_OptionalRef.hpp      |   14 +-
 ...ndLineArgumentsAndEnvironmentVariables.hpp |   58 +
 .../kokkos/core/src/impl/Kokkos_Profiling.cpp |  161 +-
 .../kokkos/core/src/impl/Kokkos_Profiling.hpp |    8 +-
 .../src/impl/Kokkos_Profiling_Interface.hpp   |    2 +
 .../src/impl/Kokkos_QuadPrecisionMath.hpp     |   78 +-
 .../core/src/impl/Kokkos_SharedAlloc.cpp      |    9 +-
 .../core/src/impl/Kokkos_SharedAlloc.hpp      |   38 +-
 .../src/impl/Kokkos_SimpleTaskScheduler.hpp   |    7 +-
 .../core/src/impl/Kokkos_SingleTaskQueue.hpp  |    1 -
 .../kokkos/core/src/impl/Kokkos_Spinwait.cpp  |    4 +
 .../kokkos/core/src/impl/Kokkos_Spinwait.hpp  |   24 +-
 .../core/src/impl/Kokkos_Stacktrace.cpp       |    5 +
 .../src/impl/Kokkos_StringManipulation.hpp    |  220 ++
 .../kokkos/core/src/impl/Kokkos_TaskBase.hpp  |    5 +-
 .../kokkos/core/src/impl/Kokkos_TaskNode.hpp  |    1 -
 .../kokkos/core/src/impl/Kokkos_TaskQueue.hpp |    1 -
 .../core/src/impl/Kokkos_TaskQueueCommon.hpp  |    1 -
 .../impl/Kokkos_TaskQueueMemoryManager.hpp    |    1 -
 .../src/impl/Kokkos_TaskQueueMultiple.hpp     |    1 -
 .../core/src/impl/Kokkos_TaskResult.hpp       |    1 -
 .../core/src/impl/Kokkos_TaskTeamMember.hpp   |    5 +-
 .../core/src/impl/Kokkos_Tools_Generic.hpp    |   15 +-
 .../kokkos/core/src/impl/Kokkos_Traits.hpp    |   13 +-
 .../kokkos/core/src/impl/Kokkos_Utilities.hpp |    3 +
 .../core/src/impl/Kokkos_VLAEmulation.hpp     |    2 +-
 .../kokkos/core/src/impl/Kokkos_ViewArray.hpp |   88 +-
 .../kokkos/core/src/impl/Kokkos_ViewCtor.hpp  |   33 +-
 .../core/src/impl/Kokkos_ViewLayoutTiled.hpp  |  383 +--
 .../core/src/impl/Kokkos_ViewMapping.hpp      |  373 ++-
 .../core/src/impl/Kokkos_ViewTracker.hpp      |    2 +-
 .../core/src/impl/Kokkos_ViewUniformType.hpp  |    8 +-
 .../kokkos/core/src/impl/Kokkos_hwloc.cpp     |   29 +-
 .../core/src/setup/Kokkos_Setup_Cuda.hpp      |    2 -
 .../core/src/setup/Kokkos_Setup_SYCL.hpp      |    9 +
 .../core/src/traits/Kokkos_WorkTagTrait.hpp   |   11 +-
 packages/kokkos/core/unit_test/CMakeLists.txt |   73 +-
 packages/kokkos/core/unit_test/Makefile       |    5 +-
 packages/kokkos/core/unit_test/TestAbort.hpp  |  138 +
 .../kokkos/core/unit_test/TestAggregate.hpp   |   11 +-
 packages/kokkos/core/unit_test/TestArray.cpp  |   84 +
 .../core/unit_test/TestAtomicOperations.hpp   |    1 -
 .../kokkos/core/unit_test/TestAtomicViews.hpp |    3 +-
 .../kokkos/core/unit_test/TestAtomics.hpp     |   12 +-
 packages/kokkos/core/unit_test/TestCXX11.hpp  |    3 +-
 .../kokkos/core/unit_test/TestComplex.hpp     |   10 +-
 .../core/unit_test/TestConcurrentBitset.hpp   |    1 -
 .../core/unit_test/TestDeepCopyAlignment.hpp  |   12 +-
 .../unit_test/TestDefaultDeviceTypeInit.hpp   |    4 +-
 .../unit_test/TestExecSpacePartitioning.hpp   |    1 -
 .../core/unit_test/TestExecutionSpace.hpp     |   46 +-
 .../core/unit_test/TestFunctorAnalysis.hpp    |   14 +-
 .../core/unit_test/TestHalfOperators.hpp      |    1 -
 .../core/unit_test/TestHostSharedPtr.hpp      |    2 +-
 .../TestHostSharedPtrAccessOnDevice.hpp       |   16 +-
 packages/kokkos/core/unit_test/TestInit.hpp   |    1 -
 .../unit_test/TestInitializationSettings.cpp  |  124 +
 .../kokkos/core/unit_test/TestInterOp.cpp     |   38 +-
 .../core/unit_test/TestIrregularLayout.hpp    |    1 -
 .../TestJoinBackwardCompatibility.hpp         |  154 +
 .../unit_test/TestLegionInteroperability.cpp  |  159 +
 .../core/unit_test/TestLocalDeepCopy.hpp      |    1 -
 .../kokkos/core/unit_test/TestMDRange.hpp     |   28 +-
 .../unit_test/TestMathematicalFunctions.hpp   |  135 +-
 .../unit_test/TestMathematicalFunctions1.hpp  |   47 +
 .../unit_test/TestMathematicalFunctions2.hpp  |   47 +
 .../TestMathematicalSpecialFunctions.hpp      |   20 +-
 .../kokkos/core/unit_test/TestMemoryPool.hpp  |   11 +-
 .../kokkos/core/unit_test/TestMinMaxClamp.hpp |  164 +-
 .../unit_test/TestNonTrivialScalarTypes.hpp   |   81 +-
 .../core/unit_test/TestNumericTraits.hpp      |   76 +-
 packages/kokkos/core/unit_test/TestOther.hpp  |    6 +-
 .../TestParseCmdLineArgsAndEnvVars.cpp        |  499 +++
 .../core/unit_test/TestPolicyConstruction.hpp |   33 +-
 .../core/unit_test/TestQuadPrecisionMath.hpp  |   52 +-
 .../kokkos/core/unit_test/TestRealloc.hpp     |   56 +-
 packages/kokkos/core/unit_test/TestReduce.hpp |   54 +-
 .../unit_test/TestReduceCombinatorical.hpp    |   48 +-
 .../kokkos/core/unit_test/TestReducers.hpp    |   39 +-
 .../kokkos/core/unit_test/TestReducers_d.hpp  |   55 +-
 packages/kokkos/core/unit_test/TestResize.hpp |   64 +-
 packages/kokkos/core/unit_test/TestScan.hpp   |    3 +-
 .../kokkos/core/unit_test/TestSharedAlloc.hpp |    1 -
 .../core/unit_test/TestStringManipulation.cpp |  217 ++
 .../core/unit_test/TestTaskScheduler.hpp      |    1 -
 packages/kokkos/core/unit_test/TestTeam.hpp   |  191 +-
 .../kokkos/core/unit_test/TestTeamBasic.hpp   |   54 +-
 .../core/unit_test/TestTeamReductionScan.hpp  |   45 +-
 .../kokkos/core/unit_test/TestTeamScan.hpp    |   21 +-
 .../core/unit_test/TestTeamTeamSize.hpp       |    9 -
 .../kokkos/core/unit_test/TestTeamVector.hpp  |    7 +-
 .../core/unit_test/TestTeamVectorRange.hpp    |   56 -
 .../unit_test/TestTemplateMetaFunctions.hpp   |  181 --
 .../kokkos/core/unit_test/TestUniqueToken.hpp |   26 +-
 .../kokkos/core/unit_test/TestUtilities.hpp   |    1 -
 .../kokkos/core/unit_test/TestViewAPI.hpp     |  120 +-
 .../kokkos/core/unit_test/TestViewAPI_b.hpp   |    2 +
 .../kokkos/core/unit_test/TestViewAPI_e.hpp   |   38 +-
 .../core/unit_test/TestViewCtorDimMatch.hpp   |  430 +++
 .../kokkos/core/unit_test/TestViewHooks.hpp   |  159 +
 .../core/unit_test/TestViewIsAssignable.hpp   |    4 +-
 .../TestViewLayoutStrideAssignment.hpp        |   21 +-
 .../core/unit_test/TestViewLayoutTiled.hpp    |    6 +
 .../core/unit_test/TestViewMapping_a.hpp      |   21 +-
 .../core/unit_test/TestViewMapping_b.hpp      |    1 -
 .../unit_test/TestViewMapping_subview.hpp     |    1 -
 .../TestViewMemoryAccessViolation.hpp         |  221 ++
 .../kokkos/core/unit_test/TestViewOfClass.hpp |    1 -
 .../core/unit_test/TestViewSpaceAssign.hpp    |    1 -
 .../kokkos/core/unit_test/TestViewSubview.hpp |    5 +-
 .../unit_test/TestWithoutInitializing.hpp     |  265 ++
 .../TestHIPHostPinned_Category.hpp            |    1 +
 .../TestHIPManaged_Category.hpp}              |   17 +-
 .../category_files/TestHIP_Category.hpp       |    1 +
 .../TestSYCLHostUSM_Category.hpp              |    1 +
 .../TestSYCLSharedUSM_Category.hpp            |    1 +
 .../category_files/TestSYCL_Category.hpp      |    1 +
 .../unit_test/cuda/TestCuda_InterOp_Init.cpp  |    3 +-
 .../cuda/TestCuda_InterOp_Streams.cpp         |    3 +-
 .../cuda/TestCuda_ReducerViewSizeLimit.cpp    |    8 -
 .../default/TestDefaultDeviceTypeViewAPI.cpp  |    6 +-
 .../headers_self_contained/tstHeader.cpp      |    2 +
 .../unit_test/hip/TestHIP_InterOp_Init.cpp    |    3 +-
 .../unit_test/hip/TestHIP_InterOp_Streams.cpp |    3 +-
 .../hip/TestHIP_Memory_Requirements.cpp       |   86 +
 .../core/unit_test/hip/TestHIP_ScanUnit.cpp   |    7 +-
 .../core/unit_test/hip/TestHIP_Spaces.cpp     |   96 +
 .../hpx/TestHPX_IndependentInstances.cpp      |  134 +-
 ...X_IndependentInstancesDelayedExecution.cpp |   44 +-
 ...estHPX_IndependentInstancesInstanceIds.cpp |  108 +-
 ...estHPX_IndependentInstancesRefCounting.cpp |   50 +-
 .../core/unit_test/hpx/TestHPX_InterOp.cpp    |    6 +-
 .../incremental/Test01_execspace.hpp          |    6 +-
 .../incremental/Test14_MDRangeReduce.hpp      |    6 -
 .../core/unit_test/openmp/TestOpenMP.hpp      |    1 -
 .../unit_test/openmp/TestOpenMP_InterOp.cpp   |    3 +-
 .../openmptarget/TestOpenMPTarget.hpp         |    1 -
 .../unit_test/sycl/TestSYCL_InterOp_Init.cpp  |    5 +-
 .../sycl/TestSYCL_InterOp_Init_Context.cpp    |    4 +-
 .../sycl/TestSYCL_InterOp_Streams.cpp         |    8 +-
 .../sycl/TestSYCL_TeamScratchStreams.cpp      |    2 +-
 .../unit_test/tools/TestLogicalSpaces.hpp     |    3 +
 .../tools/TestWithoutInitializing.cpp         |   76 +
 .../tools/include/ToolTestingUtilities.hpp    |   18 +
 .../build_cmake_installed/CMakeLists.txt      |    6 -
 .../build_cmake_installed/cmake_example.cpp   |    2 +-
 .../CMakeLists.txt                            |    4 -
 .../foo.cpp                                   |    2 +-
 .../cmake_example.cpp                         |    2 +-
 .../simple_mdrangepolicy.cpp                  |   36 +-
 packages/kokkos/generate_makefile.bash        |   26 +-
 packages/kokkos/gnu_generate_makefile.bash    |    9 +
 packages/kokkos/master_history.txt            |    1 +
 .../kokkos/scripts/docker/Dockerfile.nvhpc    |    4 +-
 .../scripts/docker/Dockerfile.openmptarget    |    2 +-
 .../kokkos/scripts/docker/Dockerfile.sycl     |    4 +-
 .../scripts/testing_scripts/test_all_sandia   |   30 +-
 packages/kokkos/simd/CMakeLists.txt           |   10 +
 packages/kokkos/simd/cmake/Dependencies.cmake |    5 +
 packages/kokkos/simd/src/CMakeLists.txt       |   29 +
 packages/kokkos/simd/src/Kokkos_SIMD.hpp      |  161 +
 .../kokkos/simd/src/Kokkos_SIMD_AVX512.hpp    | 1023 +++++++
 .../kokkos/simd/src/Kokkos_SIMD_Common.hpp    |  428 +++
 .../kokkos/simd/src/Kokkos_SIMD_Scalar.hpp    |  353 +++
 .../kokkos/simd/src/Kokkos_SIMD_dummy.cpp     |    7 +
 .../kokkos/simd/unit_tests/CMakeLists.txt     |    5 +
 packages/kokkos/simd/unit_tests/TestSIMD.cpp  |  376 +++
 .../kokkos/simd/unit_tests/UnitTestMain.cpp   |   54 +
 .../desul/include}/desul/.clang-format        |    0
 .../desul/include}/desul/atomics.hpp          |    5 +-
 .../include}/desul/atomics/Atomic_Ref.hpp     |   64 +-
 .../tpls/desul/include/desul/atomics/CUDA.hpp |  664 ++++
 .../desul/include}/desul/atomics/Common.hpp   |   40 +-
 .../desul/atomics/Compare_Exchange.hpp        |    3 +-
 .../desul/atomics/Compare_Exchange_CUDA.hpp   |  131 +-
 .../desul/atomics/Compare_Exchange_GCC.hpp    |   45 +-
 .../desul/atomics/Compare_Exchange_HIP.hpp    |   42 +-
 .../desul/atomics/Compare_Exchange_MSVC.hpp   |    0
 .../desul/atomics/Compare_Exchange_OpenMP.hpp |   79 +-
 .../desul/atomics/Compare_Exchange_SYCL.hpp   |   99 +
 .../atomics/Compare_Exchange_ScopeCaller.hpp  |    0
 .../desul/atomics/Compare_Exchange_Serial.hpp |   12 +-
 .../desul/include}/desul/atomics/GCC.hpp      |   36 +-
 .../desul/include}/desul/atomics/Generic.hpp  |    0
 .../desul/include}/desul/atomics/HIP.hpp      |    3 -
 .../include}/desul/atomics/Lock_Array.hpp     |   13 +-
 .../desul/atomics/Lock_Array_Cuda.hpp         |   65 +-
 .../include}/desul/atomics/Lock_Array_HIP.hpp |   37 +-
 .../desul/include}/desul/atomics/Macros.hpp   |    0
 .../desul/include}/desul/atomics/OpenMP.hpp   |    4 +-
 .../tpls/desul/include/desul/atomics/SYCL.hpp |   64 +
 .../desul/atomics/SYCLConversions.hpp         |   24 +-
 .../include}/desul/atomics/cuda/CUDA_asm.hpp  |    0
 .../desul/atomics/cuda/CUDA_asm_exchange.hpp  |    0
 .../desul/atomics/cuda/cuda_cc7_asm.inc       |    0
 .../cuda/cuda_cc7_asm_atomic_fetch_op.inc     |    0
 ...da_cc7_asm_atomic_fetch_op.inc_forceglobal |    0
 .../cuda_cc7_asm_atomic_fetch_op.inc_generic  |    0
 .../cuda_cc7_asm_atomic_fetch_op.inc_isglobal |    0
 ...cuda_cc7_asm_atomic_fetch_op.inc_predicate |    0
 .../atomics/cuda/cuda_cc7_asm_atomic_op.inc   |    0
 .../cuda_cc7_asm_atomic_op.inc_forceglobal    |    0
 .../cuda/cuda_cc7_asm_atomic_op.inc_generic   |    0
 .../cuda/cuda_cc7_asm_atomic_op.inc_isglobal  |    0
 .../cuda/cuda_cc7_asm_atomic_op.inc_predicate |    0
 .../atomics/cuda/cuda_cc7_asm_exchange.inc    |    0
 .../cuda/cuda_cc7_asm_exchange_memorder.inc   |    0
 .../atomics/cuda/cuda_cc7_asm_exchange_op.inc |    0
 .../atomics/cuda/cuda_cc7_asm_memorder.inc    |    0
 .../desul/atomics/openmp/OpenMP_40.hpp        |    0
 .../desul/atomics/openmp/OpenMP_40_op.inc     |    0
 .../desul/src/Lock_Array_CUDA.cpp             |   33 +-
 .../src => tpls}/desul/src/Lock_Array_HIP.cpp |   23 +-
 packages/kokkos/tpls/gtest/gtest/gtest-all.cc |   31 -
 src/utils/Array.hpp                           |   39 +-
 src/utils/PugsUtils.cpp                       |    4 +-
 src/utils/Table.hpp                           |   89 +-
 tests/mpi_test_main.cpp                       |   10 +-
 tests/test_main.cpp                           |   10 +-
 673 files changed, 42751 insertions(+), 28596 deletions(-)
 create mode 100644 packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml
 rename packages/kokkos/algorithms/src/std_algorithms/{numeric => }/Kokkos_AdjacentDifference.hpp (72%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{Kokkos_ModifyingOperations.hpp => Kokkos_IterSwap.hpp} (79%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElementOperations.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitioningOperations.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{numeric => }/Kokkos_Reduce.hpp (70%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_SortingOperations.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{numeric => }/Kokkos_TransformReduce.hpp (62%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{ => impl}/Kokkos_Constraints.hpp (100%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp
 rename packages/kokkos/{core/src/Threads/Kokkos_ThreadsExec_base.cpp => algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp} (52%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{ => impl}/Kokkos_HelperPredicates.hpp (97%)
 rename packages/kokkos/algorithms/src/std_algorithms/{numeric => impl}/Kokkos_IdentityReferenceUnaryFunctor.hpp (93%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{ => impl}/Kokkos_RandomAccessIterator.hpp (88%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{ => impl}/Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp (91%)
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
 create mode 100644 packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp
 rename packages/kokkos/algorithms/src/std_algorithms/{ => impl}/Kokkos_ValueWrapperForNoNeutralElement.hpp (89%)
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet1.hpp
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet2.hpp
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_ExclusiveScan.hpp
 delete mode 100644 packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_InclusiveScan.hpp
 create mode 100644 packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp
 delete mode 100644 packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsScalarRedVsView.cpp
 delete mode 100644 packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
 create mode 100644 packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
 create mode 100644 packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
 create mode 100644 packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
 create mode 100644 packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp
 create mode 100644 packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp
 create mode 100644 packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp
 create mode 100644 packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp
 create mode 100644 packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp
 create mode 100644 packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp
 create mode 100644 packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
 rename packages/kokkos/core/src/OpenMP/{Kokkos_OpenMP_Exec.cpp => Kokkos_OpenMP_Instance.cpp} (72%)
 rename packages/kokkos/core/src/OpenMP/{Kokkos_OpenMP_Exec.hpp => Kokkos_OpenMP_Instance.hpp} (80%)
 rename packages/kokkos/core/src/{impl => Serial}/Kokkos_Serial.cpp (82%)
 create mode 100644 packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
 create mode 100644 packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp
 create mode 100644 packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp
 rename packages/kokkos/core/src/{impl => Serial}/Kokkos_Serial_Task.cpp (92%)
 rename packages/kokkos/core/src/{impl => Serial}/Kokkos_Serial_Task.hpp (98%)
 create mode 100644 packages/kokkos/core/src/Serial/Kokkos_Serial_UniqueToken.hpp
 rename packages/kokkos/core/src/{impl => Serial}/Kokkos_Serial_WorkGraphPolicy.hpp (95%)
 delete mode 100644 packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
 create mode 100644 packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp
 create mode 100644 packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp
 create mode 100644 packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp
 create mode 100644 packages/kokkos/core/src/Threads/Kokkos_Threads_UniqueToken.hpp
 create mode 100644 packages/kokkos/core/src/View/Hooks/Kokkos_ViewHooks.hpp
 rename packages/kokkos/{algorithms/src/std_algorithms/Kokkos_ModifyingSequenceOperations.hpp => core/src/decl/Kokkos_Declare_OPENACC.hpp} (89%)
 delete mode 100644 packages/kokkos/core/src/desul/atomics/CUDA.hpp
 delete mode 100644 packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
 delete mode 100644 packages/kokkos/core/src/desul/atomics/SYCL.hpp
 create mode 100644 packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENACC.hpp
 rename packages/kokkos/core/src/impl/{Kokkos_ExecSpaceInitializer.hpp => Kokkos_DeviceManagement.hpp} (75%)
 create mode 100644 packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp
 delete mode 100644 packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
 create mode 100644 packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp
 create mode 100644 packages/kokkos/core/src/impl/Kokkos_ParseCommandLineArgumentsAndEnvironmentVariables.hpp
 create mode 100644 packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestAbort.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestArray.cpp
 create mode 100644 packages/kokkos/core/unit_test/TestInitializationSettings.cpp
 create mode 100644 packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestLegionInteroperability.cpp
 create mode 100644 packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp
 create mode 100644 packages/kokkos/core/unit_test/TestStringManipulation.cpp
 delete mode 100644 packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestViewHooks.hpp
 create mode 100644 packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp
 rename packages/kokkos/{algorithms/src/std_algorithms/Kokkos_Numeric.hpp => core/unit_test/category_files/TestHIPManaged_Category.hpp} (82%)
 create mode 100644 packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp
 create mode 100644 packages/kokkos/simd/CMakeLists.txt
 create mode 100644 packages/kokkos/simd/cmake/Dependencies.cmake
 create mode 100644 packages/kokkos/simd/src/CMakeLists.txt
 create mode 100644 packages/kokkos/simd/src/Kokkos_SIMD.hpp
 create mode 100644 packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp
 create mode 100644 packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp
 create mode 100644 packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp
 create mode 100644 packages/kokkos/simd/src/Kokkos_SIMD_dummy.cpp
 create mode 100644 packages/kokkos/simd/unit_tests/CMakeLists.txt
 create mode 100644 packages/kokkos/simd/unit_tests/TestSIMD.cpp
 create mode 100644 packages/kokkos/simd/unit_tests/UnitTestMain.cpp
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/.clang-format (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics.hpp (98%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Atomic_Ref.hpp (91%)
 create mode 100644 packages/kokkos/tpls/desul/include/desul/atomics/CUDA.hpp
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Common.hpp (91%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange.hpp (99%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange_CUDA.hpp (69%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange_GCC.hpp (65%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange_HIP.hpp (88%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange_MSVC.hpp (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange_OpenMP.hpp (71%)
 create mode 100644 packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange_ScopeCaller.hpp (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Compare_Exchange_Serial.hpp (83%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/GCC.hpp (90%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Generic.hpp (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/HIP.hpp (99%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Lock_Array.hpp (81%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Lock_Array_Cuda.hpp (75%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Lock_Array_HIP.hpp (82%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/Macros.hpp (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/OpenMP.hpp (87%)
 create mode 100644 packages/kokkos/tpls/desul/include/desul/atomics/SYCL.hpp
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/SYCLConversions.hpp (79%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/CUDA_asm.hpp (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/CUDA_asm_exchange.hpp (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm.inc (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_exchange.inc (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/cuda/cuda_cc7_asm_memorder.inc (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/openmp/OpenMP_40.hpp (100%)
 rename packages/kokkos/{core/src => tpls/desul/include}/desul/atomics/openmp/OpenMP_40_op.inc (100%)
 rename packages/kokkos/{core/src => tpls}/desul/src/Lock_Array_CUDA.cpp (79%)
 rename packages/kokkos/{core/src => tpls}/desul/src/Lock_Array_HIP.cpp (83%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 368d9390c..fb52f0b84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,6 +285,7 @@ add_subdirectory("${PUGS_SOURCE_DIR}/packages/kokkos")
 # set as SYSTEM for static analysis
 include_directories(SYSTEM ${KOKKOS_SOURCE_DIR}/core/src)
 include_directories(SYSTEM ${KOKKOS_SOURCE_DIR}/containers/src)
+include_directories(SYSTEM ${KOKKOS_SOURCE_DIR}/tpls/desul/include)
 include_directories(SYSTEM ${KOKKOS_BINARY_DIR})
 
 set(PUGS_BUILD_KOKKOS_DEVICES "")
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml
new file mode 100644
index 000000000..b17d173ba
--- /dev/null
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml
@@ -0,0 +1,88 @@
+name: github-Linux-hpx
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{github.event_name == 'pull_request'}}
+
+jobs:
+  hpx:
+    name: hpx
+    runs-on: [ubuntu-latest]
+
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v2.2.0
+        with:
+          path: kokkos
+      - name: setup hpx dependencies
+        run: |
+          sudo apt update
+          sudo apt install \
+            clang \
+            hwloc \
+            libasio-dev \
+            libboost-all-dev \
+            ninja-build
+      - name: checkout hpx
+        uses: actions/checkout@v2.2.0
+        with:
+          repository: STELLAR-GROUP/hpx
+          ref: 1.7.1
+          path: hpx
+      - uses: actions/cache@v2
+        id:   cache-hpx
+        with:
+          path:         ./hpx/install
+          key:          kokkos-hwloc-${{ github.ref }}-${{ github.sha }}
+          restore-keys: kokkos-hwloc-${{ github.ref }}
+      - name: configure hpx
+        if: steps.cache-hpx.outputs.cache-hit != 'true'
+        run: |
+          mkdir -p hpx/{build,install}
+          cd hpx/build
+          cmake \
+            -GNinja \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_INSTALL_PREFIX=$PWD/../install \
+            -DCMAKE_CXX_COMPILER=clang++ \
+            -DHPX_WITH_UNITY_BUILD=ON \
+            -DHPX_WITH_MALLOC=system \
+            -DHPX_WITH_NETWORKING=OFF \
+            -DHPX_WITH_EXAMPLES=OFF \
+            -DHPX_WITH_TESTS=OFF \
+            ..
+      - name: build and install hpx
+        if: steps.cache-hpx.outputs.cache-hit != 'true'
+        working-directory: hpx/build
+        run: ninja -j2 install
+
+      - name: configure kokkos
+        run: |
+          mkdir -p kokkos/{build,install}
+          cd kokkos/build
+          cmake \
+            -GNinja \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_INSTALL_PREFIX=$PWD/../install \
+            -DCMAKE_CXX_COMPILER=clang++ \
+            -DCMAKE_CXX_FLAGS="-Werror" \
+            -DHPX_ROOT=$PWD/../../hpx/install \
+            -DKokkos_ARCH_NATIVE=ON \
+            -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+            -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
+            -DKokkos_ENABLE_EXAMPLES=ON \
+            -DKokkos_ENABLE_HPX=ON \
+            -DKokkos_ENABLE_HPX_ASYNC_DISPATCH=ON \
+            -DKokkos_ENABLE_SERIAL=OFF \
+            -DKokkos_ENABLE_TESTS=ON \
+            ..
+
+      - name: build_and_install_kokkos
+        working-directory: kokkos/build
+        run: ninja -j2 install
+
+      - name: test_kokkos
+        working-directory: kokkos/build
+        run: ctest --timeout 2000 -j2 --output-on-failure
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
index dba3a7074..b2b4bfc31 100644
--- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
@@ -14,27 +14,33 @@ jobs:
         cxx: ['g++', 'clang++']
         cmake_build_type: ['Release', 'Debug']
         backend: ['OPENMP']
+        clang-tidy: ['']
         include:
           - distro: 'fedora:intel'
             cxx: 'icpc'
             cmake_build_type: 'Release'
             backend: 'OPENMP'
+            clang-tidy: ''
           - distro: 'fedora:intel'
             cxx: 'icpc'
             cmake_build_type: 'Debug'
             backend: 'OPENMP'
+            clang-tidy: ''
           - distro: 'fedora:intel'
             cxx: 'icpx'
             cmake_build_type: 'Release'
             backend: 'OPENMP'
+            clang-tidy: ''
           - distro: 'fedora:intel'
             cxx: 'icpx'
             cmake_build_type: 'Debug'
             backend: 'OPENMP'
+            clang-tidy: ''
           - distro: 'ubuntu:latest'
             cxx: 'clang++'
             cmake_build_type: 'RelWithDebInfo'
             backend: 'THREADS'
+            clang-tidy: '-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*"'
           - distro: 'ubuntu:latest'
             cxx: 'g++'
             cmake_build_type: 'RelWithDebInfo'
@@ -45,6 +51,21 @@ jobs:
       # see https://github.com/actions/virtual-environments/issues/3812
       options: --security-opt seccomp=unconfined
     steps:
+      - name: Checkout desul
+        uses: actions/checkout@v2.2.0
+        with:
+          repository: desul/desul
+          ref: 477da9c8f40f8db369c28dd3f93a67e376d8511b
+          path: desul
+      - name: Install desul
+        working-directory: desul
+        run: |
+          git submodule init
+          git submodule update
+          mkdir build
+          cd build
+          cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install ..
+          sudo cmake --build . --target install --parallel 2
       - name: Checkout code
         uses: actions/checkout@v2.2.0
       - uses: actions/cache@v2
@@ -58,11 +79,17 @@ jobs:
       - name: maybe_use_external_gtest
         if: ${{ matrix.distro == 'ubuntu:latest' }}
         run: sudo apt-get update && sudo apt-get install -y libgtest-dev
-      - name: CMake
+      - name: maybe_install_clang_tidy
+        if: ${{ matrix.clang-tidy != '' }}
+        run: sudo apt-get update && sudo apt-get install -y clang-tidy
+      - name: Configure Kokkos
         run: |
           cmake -B builddir \
             -DCMAKE_INSTALL_PREFIX=/usr \
+            ${{ matrix.clang-tidy }} \
+            -Ddesul_ROOT=/usr/desul-install/ \
             -DKokkos_ARCH_NATIVE=ON \
+            -DKokkos_ENABLE_DESUL_ATOMICS_EXTERNAL=ON \
             -DKokkos_ENABLE_HWLOC=ON \
             -DKokkos_ENABLE_${{ matrix.backend }}=ON \
             -DKokkos_ENABLE_TESTS=ON \
diff --git a/packages/kokkos/.github/workflows/osx.yml b/packages/kokkos/.github/workflows/osx.yml
index 69a09adf8..0e043c5f8 100644
--- a/packages/kokkos/.github/workflows/osx.yml
+++ b/packages/kokkos/.github/workflows/osx.yml
@@ -31,6 +31,7 @@ jobs:
             -DKokkos_ENABLE_${{ matrix.backend }}=On
             -DCMAKE_CXX_FLAGS="-Werror"
             -DCMAKE_CXX_STANDARD=14
+            -DKokkos_ARCH_NATIVE=ON
             -DKokkos_ENABLE_COMPILER_WARNINGS=ON
             -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF
             -DKokkos_ENABLE_TESTS=On
diff --git a/packages/kokkos/.gitignore b/packages/kokkos/.gitignore
index eb2257762..a36540be8 100644
--- a/packages/kokkos/.gitignore
+++ b/packages/kokkos/.gitignore
@@ -12,6 +12,7 @@ testing/
 /out/build
 /CMakeSettings.json
 /out/mytest
+CMakeUserPresets.json
 
 # build directories in source tree
 /build*
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
index 91e0f8daa..9b53d5276 100644
--- a/packages/kokkos/.gitrepo
+++ b/packages/kokkos/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:kokkos/kokkos.git
 	branch = master
-	commit = d19aab9981a2c447e832a7b4eb7b16992328fb14
-	parent = a64ea7589ca011edd41ad9a3468d091cd093430c
+	commit = 61d7db55fceac3318c987a291f77b844fd94c165
+	parent = 91d53e3cfb9a55832aae102ca677044a47f2515d
 	method = merge
 	cmdver = 0.4.3
diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins
index b5d7fc307..09052840e 100644
--- a/packages/kokkos/.jenkins
+++ b/packages/kokkos/.jenkins
@@ -25,25 +25,57 @@ pipeline {
                 sh './scripts/docker/check_format_cpp.sh'
             }
         }
-
         stage('Build') {
             parallel {
-                stage('CUDA-11.4-NVHPC') {
+                stage('OPENACC-NVHPC-CUDA-11.6') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.nvhpc'
                             dir 'scripts/docker'
-                            label 'nvidia-docker && volta && large_images'
+                            label 'nvidia-docker && large_images'
+                            args '--env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
+                        }
+                    }
+                    steps {
+                        sh '''rm -rf build && mkdir -p build && cd build && \
+                              /opt/cmake/bin/cmake \
+                                -DCMAKE_CXX_COMPILER=nvc++ \
+                                -DCMAKE_CXX_STANDARD=17 \
+                                -DKokkos_ARCH_NATIVE=ON \
+                                -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_TESTS=ON \
+                                -DKokkos_ENABLE_OPENACC=ON \
+                                -DKokkos_ARCH_VOLTA70=ON \
+                              .. && \
+                              make -j8 && ctest --verbose'''
+                    }
+                }
+                stage('CUDA-11.6-NVHPC') {
+                    agent {
+                        dockerfile {
+                            filename 'Dockerfile.nvhpc'
+                            dir 'scripts/docker'
+                            label 'nvidia-docker && large_images'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
                     }
+                    environment {
+                        OMP_NUM_THREADS = 8
+                        // Nested OpenMP does not work for this configuration,
+                        // so disabling it
+                        OMP_MAX_ACTIVE_LEVELS = 1
+                        OMP_PLACES = 'threads'
+                        OMP_PROC_BIND = 'spread'
+                        NVCC_WRAPPER_DEFAULT_COMPILER = 'nvc++'
+                    }
                     steps {
                         sh '''rm -rf build && mkdir -p build && cd build && \
                               /opt/cmake/bin/cmake \
                                 -DCMAKE_BUILD_TYPE=Debug \
-                                -DCMAKE_CXX_COMPILER=nvc++ \
+                                -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DCMAKE_CXX_STANDARD=17 \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
@@ -70,7 +102,8 @@ pipeline {
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
                                 -DCMAKE_CXX_COMPILER=clang++ \
-                                -DCMAKE_CXX_FLAGS="-Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-linker-warnings" \
+                                -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-linker-warnings" \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ARCH_VOLTA70=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
@@ -101,6 +134,7 @@ pipeline {
                     }
                     environment {
                         OMP_NUM_THREADS = 8
+                        OMP_MAX_ACTIVE_LEVELS = 3
                         OMP_PLACES = 'threads'
                         OMP_PROC_BIND = 'spread'
                     }
@@ -113,6 +147,7 @@ pipeline {
                                 -DCMAKE_CXX_COMPILER=hipcc \
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \
                                 -DCMAKE_CXX_STANDARD=14 \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
@@ -146,6 +181,7 @@ pipeline {
                                 -DCMAKE_CXX_COMPILER=hipcc \
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
                                 -DCMAKE_CXX_STANDARD=17 \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
                                 -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
@@ -160,6 +196,7 @@ pipeline {
                         }
                     }
                 }
+/*
                 stage('OPENMPTARGET-ROCm-4.5') {
                     agent {
                         dockerfile {
@@ -172,6 +209,7 @@ pipeline {
                     }
                     environment {
                         OMP_NUM_THREADS = 8
+                        OMP_MAX_ACTIVE_LEVELS = 3
                         OMP_PLACES = 'threads'
                         OMP_PROC_BIND = 'spread'
                         LC_ALL = 'C'
@@ -185,6 +223,7 @@ pipeline {
                                 -DCMAKE_BUILD_TYPE=Debug \
                                 -DCMAKE_CXX_COMPILER=amdclang++ \
                                 -DCMAKE_CXX_STANDARD=17 \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
@@ -202,6 +241,7 @@ pipeline {
                         }
                     }
                 }
+*/
                 stage('OPENMPTARGET-Clang') {
                     agent {
                         dockerfile {
@@ -218,6 +258,7 @@ pipeline {
                                 -DCMAKE_BUILD_TYPE=RelWithDebInfo \
                                 -DCMAKE_CXX_COMPILER=clang++ \
                                 -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror -Wno-undefined-internal -Wno-pass-failed" \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
                                 -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
@@ -254,6 +295,7 @@ pipeline {
                                 -DCMAKE_CXX_COMPILER=clang++ \
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DCMAKE_CXX_STANDARD=14 \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
                                 -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
@@ -306,12 +348,15 @@ pipeline {
                             filename 'Dockerfile.nvcc'
                             dir 'scripts/docker'
                             additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3'
-                            label 'nvidia-docker && volta'
+                            label 'nvidia-docker'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
                     }
                     environment {
                         OMP_NUM_THREADS = 8
+                        // Nested OpenMP does not work for this configuration,
+                        // so disabling it
+                        OMP_MAX_ACTIVE_LEVELS = 1
                         OMP_PLACES = 'threads'
                         OMP_PROC_BIND = 'spread'
                         NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8'
@@ -325,6 +370,7 @@ pipeline {
                                 -DCMAKE_CXX_COMPILER=g++-8 \
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DCMAKE_CXX_STANDARD=17 \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
                                 -DKokkos_ENABLE_CUDA=ON \
@@ -343,7 +389,7 @@ pipeline {
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
                                 -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \
-                                -DCMAKE_CXX_FLAGS=-Werror \
+                                -DCMAKE_CXX_FLAGS=-Werror --Werror=all-warnings -Xcudafe --diag_suppress=3159 \
                                 -DCMAKE_CXX_STANDARD=17 \
                                 -DKokkos_INSTALL_TESTING=ON \
                               .. && \
@@ -367,13 +413,13 @@ pipeline {
                         }
                     }
                 }
-                stage('CUDA-10.1-NVCC-DEBUG') {
+                stage('CUDA-11.6-NVCC-DEBUG') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.nvcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:10.1-devel'
-                            label 'nvidia-docker && volta'
+                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.0-devel-ubuntu20.04'
+                            label 'nvidia-docker'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
                     }
@@ -386,6 +432,7 @@ pipeline {
                                 -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DCMAKE_CXX_STANDARD=14 \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEBUG=ON \
                                 -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK=ON \
@@ -416,6 +463,7 @@ pipeline {
                     }
                     environment {
                         OMP_NUM_THREADS = 8
+                        OMP_MAX_ACTIVE_LEVELS = 3
                         OMP_PROC_BIND = 'true'
                     }
                     steps {
@@ -424,6 +472,7 @@ pipeline {
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_STANDARD=14 \
                                 -DCMAKE_CXX_FLAGS=-Werror \
+                                -DKokkos_ARCH_NATIVE=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
                                 -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
@@ -431,6 +480,7 @@ pipeline {
                                 -DKokkos_ENABLE_OPENMP=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
                                 -DKokkos_ENABLE_LIBQUADMATH=ON \
+                                -DKokkos_ENABLE_SERIAL=ON \
                                 -DCMAKE_PREFIX_PATH=/usr/lib/gcc/x86_64-linux-gnu/5.3.1 \
                               .. && \
                               make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c'''
diff --git a/packages/kokkos/BUILD.md b/packages/kokkos/BUILD.md
index 114baf99f..a8985ef1f 100644
--- a/packages/kokkos/BUILD.md
+++ b/packages/kokkos/BUILD.md
@@ -27,7 +27,7 @@ When configuring your project just set:
   -DKokkos_ROOT=${kokkos_install_prefix} \
   -DCMAKE_CXX_COMPILER=${compiler_used_to_build_kokkos}
 ````
-Note: You may need the following if using some versions of CMake (e.g. 3.12):
+Note: You may need the following if your project requires a minimum CMake version older than 3.12:
 ````cmake
 cmake_policy(SET CMP0074 NEW)
 ````
@@ -171,6 +171,9 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 * Kokkos_ENABLE_HPX_ASYNC_DISPATCH
     * Whether HPX supports asynchronous dispatch
     * BOOL Default: OFF
+* Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC
+    * Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2). This is an experimental performance feature and currently has issue when using with UCX. See https://github.com/kokkos/kokkos/issues/4228 for more details.
+    * BOOL Default: OFF
 * Kokkos_ENABLE_LARGE_MEM_TESTS
     * Whether to perform extra large memory tests
     * BOOL_Default: OFF
@@ -235,6 +238,9 @@ The following options control `find_package` paths for CMake-based TPLs:
 ## Architecture Keywords
 Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_X`.
 
+* Kokkos_ARCH_NATIVE
+    * Whether to optimize for the the local CPU architecture
+    * BOOL Default: OFF
 * Kokkos_ARCH_AMDAVX
     * Whether to optimize for the AMDAVX architecture
     * BOOL Default: OFF
@@ -310,12 +316,24 @@ Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_
 * Kokkos_ARCH_POWER9
     * Whether to optimize for the POWER9 architecture
     * BOOL Default: OFF
+* Kokkos_ARCH_ICL
+    * Whether to optimize for the ICL architecture
+    * BOOL Default: OFF
+* Kokkos_ARCH_ICX
+    * Whether to optimize for the ICX architecture
+    * BOOL Default: OFF
+* Kokkos_ARCH_SKL
+    * Whether to optimize for the SKL architecture
+    * BOOL Default: OFF
 * Kokkos_ARCH_SKX
     * Whether to optimize for the SKX architecture
     * BOOL Default: OFF
 * Kokkos_ARCH_SNB
     * Whether to optimize for the SNB architecture
     * BOOL Default: OFF
+* Kokkos_ARCH_SPR
+    * Whether to optimize for the SPR architecture
+    * BOOL Default: OFF
 * Kokkos_ARCH_TURING75
     * Whether to optimize for the TURING75 architecture
     * BOOL Default: OFF
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index a90850770..e81f29445 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,5 +1,157 @@
 # Change Log
 
+## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
+
+### Features:
+- Use non-volatile `join()` member functions and `operator+=` in `parallel_reduce/scan` [\#4931](https://github.com/kokkos/kokkos/pull/4931) [\#4954](https://github.com/kokkos/kokkos/pull/4954) [\#4951](https://github.com/kokkos/kokkos/pull/4951)
+- Add `SIMD` sub package (requires C++17) [\#5016](https://github.com/kokkos/kokkos/pull/5016)
+- Add `is_finalized()` [\#5247](https://github.com/kokkos/kokkos/pull/5247)
+- Promote mathematical functions from `namespace Kokkos::Experimental` to `namespace Kokkos` [\#4791](https://github.com/kokkos/kokkos/pull/4791)
+- Promote `min`, `max`, `clamp`, `minmax` functions from `namespace Kokkos::Experimental` to `namespace Kokkos` [\#5170](https://github.com/kokkos/kokkos/pull/5170)
+- Add `round`, `logb`, `nextafter`, `copysign`, and `signbit` math functions [\#4768](https://github.com/kokkos/kokkos/pull/4768)
+- Add `HIPManagedSpace`, similar to `CudaUVMSpace` [\#5112](https://github.com/kokkos/kokkos/pull/5112)
+- Accept view construction allocation properties in `create_mirror[_view,_view_and_copy]` and `resize/realloc` [\#5125](https://github.com/kokkos/kokkos/pull/5125) [\#5095](https://github.com/kokkos/kokkos/pull/5095) [\#5035](https://github.com/kokkos/kokkos/pull/5035) [\#4805](https://github.com/kokkos/kokkos/pull/4805) [\#4844](https://github.com/kokkos/kokkos/pull/4844)
+- Allow `MemorySpace::allocate()` to be called with execution space [\#4826](https://github.com/kokkos/kokkos/pull/4826)
+- Experimental: Compile time view subscriber [\#4197](https://github.com/kokkos/kokkos/pull/4197)
+
+### Backends and Archs Enhancements:
+- Add support for Sapphire Rapids Intel architecture [\#5015](https://github.com/kokkos/kokkos/pull/5015)
+- Add support for ICX, SKL and ICL Intel architectures [\#5013](https://github.com/kokkos/kokkos/pull/5013) [\#4929](https://github.com/kokkos/kokkos/pull/4929)
+- Add arch flags for Intel GPU Ponte Vecchio [\#4932](https://github.com/kokkos/kokkos/pull/4932)
+- SYCL: require GPU if GPU architecture was set at configuration time (i.e. do not allow fallback to CPU device) [\#5264](https://github.com/kokkos/kokkos/pull/5264) [\#5222](https://github.com/kokkos/kokkos/pull/5222)
+- SYCL: Add `SYCL::sycl_queue()` for interoperability [\#5241](https://github.com/kokkos/kokkos/pull/5241)
+- SYCL: Loosen restriction for using built-in `sycl::group_broadcast` [\#4552](https://github.com/kokkos/kokkos/pull/4552)
+- SYCL: preserve address space [\#4396](https://github.com/kokkos/kokkos/pull/4396)
+- OpenMPTarget: Adding a workaound for team scan [\#5219](https://github.com/kokkos/kokkos/pull/5219)
+- OpenMPTarget: Adding logic to skip the kernel launch if `league_size=0` [\#5067](https://github.com/kokkos/kokkos/pull/5067)
+- OpenMPTarget: Make sure `Kokkos::abort()` causes abnormal program termination when called on the host-side [\#4808](https://github.com/kokkos/kokkos/pull/4808)
+- HIP: Make HIPHostPinnedSpace coarse-grained [\#5152](https://github.com/kokkos/kokkos/pull/5152)
+- Refactor OpenMP `parallel_for` implementation to use more native OpenMP constructs [\#4664](https://github.com/kokkos/kokkos/pull/4664)
+- Add option to optimize for local CPU architecture `Kokkos_ARCH_NATIVE` [\#4930](https://github.com/kokkos/kokkos/pull/4930)
+
+
+### Implemented enhancements
+- Add command line argument/environment variable to print the configuration [\#5233](https://github.com/kokkos/kokkos/pull/5233)
+- Improve error message in view memory access violations [\#4950](https://github.com/kokkos/kokkos/pull/4950)
+- Remove unnecessary fences in View initialization [\#4823](https://github.com/kokkos/kokkos/pull/4823)
+- Make `View::shmem_size()` device-callable [\#4936](https://github.com/kokkos/kokkos/pull/4936)
+- Update numerics support for `__float128` [\#5081](https://github.com/kokkos/kokkos/pull/5081)
+- Add `log10` overload for `Kokkos::complex` [\#5009](https://github.com/kokkos/kokkos/pull/5009)
+- Add `[[nodiscard]]` to `ScopeGuard` [\#5224](https://github.com/kokkos/kokkos/pull/5224)
+- Add structured binding support for `Kokkos::Array` [\#4962](https://github.com/kokkos/kokkos/pull/4962)
+- Enable accessing `Kokkos::Array` elements in constant expressions [\#4916](https://github.com/kokkos/kokkos/pull/4916)
+- Mark `as_view_of_rank_n` as KOKKOS_FUNCTION [\#5248](https://github.com/kokkos/kokkos/pull/5248)
+- Cleanup/rework fence overloads [\#5148](https://github.com/kokkos/kokkos/pull/5148)
+- Assert that `Layout` construction from extents is valid in functions taking integer extents [\#5209](https://github.com/kokkos/kokkos/pull/5209)
+- Add `fill_random` overload that takes an execution space as first argument [\#5181](https://github.com/kokkos/kokkos/pull/5181)
+- Avoid some unnecessary fences in `parallel_reduce/scan` [\#5154](https://github.com/kokkos/kokkos/pull/5154)
+- Include `KOKKOS_ENABLE_LIBDL` in options when printing configuration [\#5086](https://github.com/kokkos/kokkos/pull/5086)
+- DynRankView: make `layout()` return the same as a corresponding static View [\#5026](https://github.com/kokkos/kokkos/pull/5026)
+- Use `_mm_malloc` for icpx [\#5012](https://github.com/kokkos/kokkos/pull/5012)
+- Avoid forcing matching execution spaces in `BinSort` constructor and `sort()` [\#4919](https://github.com/kokkos/kokkos/pull/4919)
+- Check number of bins in `BinSort` [\#4890](https://github.com/kokkos/kokkos/pull/4890)
+- Improve performance in parallel STL-like algorithms [\#4887](https://github.com/kokkos/kokkos/pull/4887) [\#4886](https://github.com/kokkos/kokkos/pull/4886)
+- Disable `memset` on A64FX and launch `parallel_for` instead (performance) [\#4884](https://github.com/kokkos/kokkos/pull/4884)
+- Allow non-power-of-two team sizes for team reductions and scans [\#4809](https://github.com/kokkos/kokkos/pull/4809)
+
+#### Harmonization of Kokkos execution environment initialization:
+- Warn when unable to detect local MPI rank and user explicitly asked for it [\#5263](https://github.com/kokkos/kokkos/pull/5263)
+- Refactor parsing of command line arguments and environment variables [\#5221](https://github.com/kokkos/kokkos/pull/5221)
+- Refactor device selection at initialization [\#5211](https://github.com/kokkos/kokkos/pull/5211)
+- Rename tools settings for consistency [\#5201](https://github.com/kokkos/kokkos/pull/5201)
+- Print help only once [\#5128](https://github.com/kokkos/kokkos/pull/5128)
+- Update precedence rule in initialization [\#5130](https://github.com/kokkos/kokkos/pull/5130)
+- Warn instead of just ignoring user settings when kokkos-tools is disabled [\#5088](https://github.com/kokkos/kokkos/pull/5088)
+- Drop numa args in threads backend initialization [\#5127](https://github.com/kokkos/kokkos/pull/5127)
+- Warn users when a flag prefixed with -[-]kokkos is not recognized and do not remove it [\#5256](https://github.com/kokkos/kokkos/pull/5256)
+- Give back to Core what belongs to Core (aka moving tune_internals option from Tools back to Core) [\#5202](https://github.com/kokkos/kokkos/pull/5202)
+
+#### Build system updates:
+- `nvcc_wrapper`: filter out -pedantic-errors from nvcc options [\#5235](https://github.com/kokkos/kokkos/pull/5235)
+- `nvcc_wrapper`: add known nvcc option --source-in-ptx [\#5052](https://github.com/kokkos/kokkos/pull/5052)
+- Link libdl as interface library [\#5179](https://github.com/kokkos/kokkos/pull/5179)
+- Only show GPU architectures with enabled corresponding backend [\#5119](https://github.com/kokkos/kokkos/pull/5119)
+- Enable optional external desul build [\#5021](https://github.com/kokkos/kokkos/pull/5021) [\#5132](https://github.com/kokkos/kokkos/pull/5132)
+- Export `Kokkos_CXX_STANDARD` variable with CMake [\#5068](https://github.com/kokkos/kokkos/pull/5068)
+- Suppress warnings with nvc++ [\#5031](https://github.com/kokkos/kokkos/pull/5031)
+- Disallow multiple host architectures in CMake [\#4996](https://github.com/kokkos/kokkos/pull/4996)
+- Do not include compiler warning flags in the compile option of the cmake target [\#4989](https://github.com/kokkos/kokkos/pull/4989)
+- AOT flags for OpenMPTarget targeting Intel GPUs [\#4915](https://github.com/kokkos/kokkos/pull/4915)
+- Repurpose `Kokkos_ARCH_INTEL_GEN` for SYCL to mean JIT to be conforming with OMPT [\#4894](https://github.com/kokkos/kokkos/pull/4894)
+- Replace amdgpu-target with offload-arch [\#4874](https://github.com/kokkos/kokkos/pull/4874)
+- Do not enable `kokkos_launch_compiler` when `CMAKE_CXX_COMPILER_LAUNCHER` is set [\#4870](https://github.com/kokkos/kokkos/pull/4870)
+- Move CMake version check up [\#4797](https://github.com/kokkos/kokkos/pull/4797)
+
+### Incompatibilities:
+- Remove `KOKKOS_THREAD_LOCAL` [\#5064](https://github.com/kokkos/kokkos/pull/5064)
+- Remove `KOKKOS_ENABLE_POSIX_MEMALIGN` [\#5011](https://github.com/kokkos/kokkos/pull/5011)
+- Remove unused `KOKKOS_ENABLE_TM` [\#4995](https://github.com/kokkos/kokkos/pull/4995)
+- Remove unused cmakedefine `KOKKOS_ENABLE_COMPILER_WARNINGS` [\#4883](https://github.com/kokkos/kokkos/pull/4883)
+- Remove unused `KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK` [\#4882](https://github.com/kokkos/kokkos/pull/4882)
+- Drop Instruction Set Architecture (ISA) macros [\#4981](https://github.com/kokkos/kokkos/pull/4981)
+- Warn in `ScopeGuard` about illegal usage [\#5250](https://github.com/kokkos/kokkos/pull/5250)
+
+### Deprecations:
+- Guard against non-public header inclusion [\#5178](https://github.com/kokkos/kokkos/pull/5178)
+- Raise deprecation warnings if non empty WorkTag class is used [\#5230](https://github.com/kokkos/kokkos/pull/5230)
+- Deprecate `parallel_*` overloads taking the label as trailing argument [\#5141](https://github.com/kokkos/kokkos/pull/5141)
+- Deprecate nested types in functional [\#5185](https://github.com/kokkos/kokkos/pull/5185)
+- Deprecate `InitArguments` struct and replace it with `InitializationSettings` [\#5135](https://github.com/kokkos/kokkos/pull/5135)
+- Deprecate `finalize_all()` [\#5134](https://github.com/kokkos/kokkos/pull/5134)
+- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
+- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
+- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
+- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
+- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
+- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
+- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)
+- Warn about `parallel_reduce` cases that call `join()` with volatile-qualified arguments [\#5215](https://github.com/kokkos/kokkos/pull/5215)
+
+### Bug Fixes:
+- CUDA Reductions: Fix data races reported by Nvidia `compute-sanitizer` [\#4855](https://github.com/kokkos/kokkos/pull/4855)
+- Work around Intel compiler bug [\#5301](https://github.com/kokkos/kokkos/pull/5301)
+- Avoid allocating memory for UniqueToken [\#5300](https://github.com/kokkos/kokkos/pull/5300)
+- DynamicView: Properly resize mirror instances after construction [\#5276](https://github.com/kokkos/kokkos/pull/5276)
+- Remove Kokkos::Rank limit of 6 ranks [\#5271](https://github.com/kokkos/kokkos/pull/5271)
+- Do not forget to set last element to nullptr when removing a flag in `Kokkos::initialize` [\#5272](https://github.com/kokkos/kokkos/pull/5272)
+- Fix CUDA+MSVC build issue [\#5261](https://github.com/kokkos/kokkos/pull/5261)
+- Fix `DynamicView::resize_serial` [\#5220](https://github.com/kokkos/kokkos/pull/5220)
+- Fix cmake default compiler flags for unknown compiler [\#5217](https://github.com/kokkos/kokkos/pull/5217)
+- Fix `move_backward` [\#5191](https://github.com/kokkos/kokkos/pull/5191)
+- Fixing issue 5196 - missing symbol with intel compiler [\#5207](https://github.com/kokkos/kokkos/pull/5207)
+- Preserve `KOKKOS_INVALID_INDEX` in ViewDimension and ArrayLayout construction [\#5188](https://github.com/kokkos/kokkos/pull/5188)
+- Finalize `deep_copy_space` early avoiding printing to `std::cerr` for Cuda [\#5151](https://github.com/kokkos/kokkos/pull/5151)
+- Use correct policy in Threads MDRange `parallel_reduce` [\#5123](https://github.com/kokkos/kokkos/pull/5123)
+- Fix building with NVCC as the CXX compiler while the CUDA backend is not enabled [\#5115](https://github.com/kokkos/kokkos/pull/5115)
+- OpenMPTarget Index range fix for MDRange. [\#5089](https://github.com/kokkos/kokkos/pull/5089)
+- Fix bug with CUDA's team reduction for empty ranges [\#5079](https://github.com/kokkos/kokkos/pull/5079)
+- Fix using `ZeroMemset` for Serial [\#5077](https://github.com/kokkos/kokkos/pull/5077)
+- Fix `Kokkos::Vector::push_back` for default execution space [\#5047](https://github.com/kokkos/kokkos/pull/5047)
+- ScatterView: Fix ScatterMin/ScatterMax to use proper atomics [\#5045](https://github.com/kokkos/kokkos/pull/5045)
+- Fix calling `ZeroMemset` in `deep_copy` [\#5040](https://github.com/kokkos/kokkos/pull/5040)
+- Make View self-assignment not produce double-free [\#5024](https://github.com/kokkos/kokkos/pull/5024)
+- Guard against unrecognized pragma with intel compilers [\#5019](https://github.com/kokkos/kokkos/pull/5019)
+- Fix racing condition in `HIPParallelLaunch` [\#5008](https://github.com/kokkos/kokkos/pull/5008)
+- KokkosP: Fix `device_id` in profiling [\#4997](https://github.com/kokkos/kokkos/pull/4997)
+- Fix for `Kokkos::vector::insert` into empty vector with begin and end iterators [\#4988](https://github.com/kokkos/kokkos/pull/4988)
+- Fix Core header files installation [\#4984](https://github.com/kokkos/kokkos/pull/4984)
+- Fix bounds errors with `Kokkos::sort` [\#4980](https://github.com/kokkos/kokkos/pull/4980)
+- Fixup let `RangePolicy::set_chunk_size` return a reference to self [\#4918](https://github.com/kokkos/kokkos/pull/4918)
+- Fix allocating large Views [\#4907](https://github.com/kokkos/kokkos/pull/4907)
+- Fix combined reductions with `Kokkos::View` [\#4896](https://github.com/kokkos/kokkos/pull/4896)
+- Fixed `_CUDA_ARCH__` to `__CUDA_ARCH__` for CUDA LDG [\#4893](https://github.com/kokkos/kokkos/pull/4893)
+- Fixup `View::access()` truncate parameter pack [\#4876](https://github.com/kokkos/kokkos/pull/4876)
+- Fix `abort` with HIP backend for ROCm 5.0.2 and beyond [\#4873](https://github.com/kokkos/kokkos/pull/4873)
+- Fix HIP version when printing the configuration [\#4872](https://github.com/kokkos/kokkos/pull/4872)
+- Fix scratch lock array when using scratch level 1 [\#4871](https://github.com/kokkos/kokkos/pull/4871)
+- Fix Makefile.kokkos to work with fujitsu compiler [\#4867](https://github.com/kokkos/kokkos/pull/4867)
+- cmake: Correct link THREADS link option [\#4854](https://github.com/kokkos/kokkos/pull/4854)
+- UniqueToken `impl_acquire` function should be device only [\#4819](https://github.com/kokkos/kokkos/pull/4819)
+- Fix example calls to non existing static `print_configuration` [\#4806](https://github.com/kokkos/kokkos/pull/4806)
+- Fix requests for large team scratch sizes [\#4728](https://github.com/kokkos/kokkos/pull/4728)
+
+
 ## [3.6.01](https://github.com/kokkos/kokkos/tree/3.6.01) (2022-05-23)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.00...3.6.01)
 
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index b0a54118a..a05bfcdb9 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -1,3 +1,4 @@
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
 
 # Disable in-source builds to prevent source tree corruption.
 if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" )
@@ -28,11 +29,6 @@ SET(KOKKOS_SRC_PATH      ${Kokkos_SOURCE_DIR})
 SET(KOKKOS_PATH          ${Kokkos_SOURCE_DIR})
 SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
-# Needed to simplify syntax of if statements
-CMAKE_POLICY(SET CMP0054 NEW)
-# Needed to make IN_LIST a valid operator
-CMAKE_POLICY(SET CMP0057 NEW)
-
 # Is this a build as part of Trilinos?
 IF(COMMAND TRIBITS_PACKAGE_DECL)
   SET(KOKKOS_HAS_TRILINOS ON)
@@ -72,7 +68,6 @@ ENDFUNCTION()
 LIST(APPEND CMAKE_MODULE_PATH cmake/Modules)
 
 IF(NOT KOKKOS_HAS_TRILINOS)
-  cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
   set(CMAKE_DISABLE_SOURCE_CHANGES ON)
   set(CMAKE_DISABLE_IN_SOURCE_BUILD ON)
 
@@ -80,7 +75,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
   # downstream dependencies need to match this!
   SET(KOKKOS_COMPILE_LANGUAGE CXX)
   # use lower case here since we didn't parse options yet
-  IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE)
+  IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA)
 
     # Without this as a language for the package we would get a C++ compiler enabled.
     # but we still need a C++ compiler even if we build all our cpp files as CUDA only
@@ -90,9 +85,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
     # days.
     SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX)
 
-    IF (Kokkos_ENABLE_CUDA)
-      SET(KOKKOS_COMPILE_LANGUAGE CUDA)
-    ENDIF()
+    SET(KOKKOS_COMPILE_LANGUAGE CUDA)
   ENDIF()
 
   IF (Spack_WORKAROUND)
@@ -135,14 +128,11 @@ ENDIF()
 
 
 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 6)
-set(Kokkos_VERSION_PATCH 01)
+set(Kokkos_VERSION_MINOR 7)
+set(Kokkos_VERSION_PATCH 00)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
-MESSAGE(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
-CMAKE_POLICY(SET CMP0074 NEW)
-
 # Load either the real TriBITS or a TriBITS wrapper
 # for certain utility functions that are universal (like GLOBAL_SET)
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake)
@@ -204,11 +194,16 @@ KOKKOS_SETUP_BUILD_ENVIRONMENT()
 OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF)
 
 SET(KOKKOS_EXT_LIBRARIES Kokkos::kokkos Kokkos::kokkoscore Kokkos::kokkoscontainers Kokkos::kokkosalgorithms)
-SET(KOKKOS_INT_LIBRARIES kokkos kokkoscore kokkoscontainers kokkosalgorithms)
+SET(KOKKOS_SUB_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms)
+IF (KOKKOS_CXX_STANDARD GREATER_EQUAL 17)
+  LIST(APPEND KOKKOS_EXT_LIBRARIES Kokkos::kokkossimd)
+  LIST(APPEND KOKKOS_SUB_LIBRARIES kokkossimd)
+ENDIF()
+SET(KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_SUB_LIBRARIES})
 SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES ${KOKKOS_INT_LIBRARIES})
 
 IF (KOKKOS_HAS_TRILINOS)
-  SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+  SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
   SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR})
   SET(KOKKOS_IS_SUBDIRECTORY TRUE)
 ELSEIF(HAS_PARENT)
@@ -296,7 +291,7 @@ IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
   #Make sure in-tree projects can reference this as Kokkos::
   #to match the installed target names
   ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos)
-  TARGET_LINK_LIBRARIES(kokkos INTERFACE kokkoscore kokkoscontainers kokkosalgorithms)
+  TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_SUB_LIBRARIES})
   KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos)
 ENDIF()
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake)
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index 10c4bc463..d493abbf1 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -1,21 +1,21 @@
 # Default settings common options.
 
 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 6
-KOKKOS_VERSION_PATCH = 01
+KOKKOS_VERSION_MINOR = 7
+KOKKOS_VERSION_PATCH = 00
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
 #KOKKOS_DEVICES ?= "OpenMP"
 KOKKOS_DEVICES ?= "Threads"
 # Options:
-# Intel:    KNC,KNL,SNB,HSW,BDW,SKX
+# Intel:    KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
 # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
-# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP
+# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
 KOKKOS_ARCH ?= ""
 # Options: yes,no
 KOKKOS_DEBUG ?= "no"
@@ -183,6 +183,8 @@ KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VE
 KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
 KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 KOKKOS_INTERNAL_COMPILER_GCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)
+# TODO fujitsu can emulate gcc or clang. Only clang mode works at the moment.
+KOKKOS_INTERNAL_COMPILER_FUJITSU     := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),FUJITSU)
 
 # Check Host Compiler if using NVCC through nvcc_wrapper
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@@ -211,8 +213,23 @@ endif
 ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
   KOKKOS_INTENAL_COMPILER_CLANG = 0
 endif
+# Fujitsu passes also as clang and gcc respectively
+ifeq ($(KOKKOS_INTERNAL_COMPILER_FUJITSU), 1)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+    # TODO  handle gcc flags and workaround for bug?
+    # fujitsu (gcc mode) is bugged, see https://github.com/kokkos/kokkos/issues/4730
+    $(warning Warning: ${CXX} in Trad Mode '-Nnoclang' (default) is not recommended. Use 'CXX = ${CXX} -Nclang' instead.)
+    # HACK since fujitsu only accepts some gcc flags, disable gcc here?
+    # KOKKOS_INTERNAL_COMPILER_GCC = 0
+  endif
+  # TODO handle clang flags
+  # warnings: works fine as is
+  # openmp: handled
+  #KOKKOS_INTERNAL_COMPILER_CLANG = 0
+endif
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+  # TODO empty variable if fujitsu (clang mode) passes as clang
   KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell $(CXX) --version | grep version | cut -d ' ' -f3 | tr -d '.')
 
   ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
@@ -262,7 +279,12 @@ else
     ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
     KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
     else
-    KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_FUJITSU), 1)
+      # fujitsu (clang mode) fails with `=libomp`
+      KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+      else
+      KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+      endif
     endif
   else
     ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
@@ -290,11 +312,15 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
     #KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp
-    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp -Wno-openmp-mapping
     KOKKOS_INTERNAL_OPENMPTARGET_LIB := -lomptarget
   else
-    #Assume GCC
-    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
+      KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fiopenmp -Wno-openmp-mapping
+    else
+      #Assume GCC
+      KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
+    endif
   endif
 endif
 
@@ -334,8 +360,12 @@ KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM)
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB)
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW)
 KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
+KOKKOS_INTERNAL_USE_ARCH_SKL := $(call kokkos_has_string,$(KOKKOS_ARCH),SKL)
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)
+KOKKOS_INTERNAL_USE_ARCH_ICL := $(call kokkos_has_string,$(KOKKOS_ARCH),ICL)
+KOKKOS_INTERNAL_USE_ARCH_ICX := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX)
+KOKKOS_INTERNAL_USE_ARCH_SPR := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR)
 
 KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen)
 KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9)
@@ -343,6 +373,7 @@ KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),
 KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP)
 KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1)
 KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC)
 
 # NVIDIA based.
 NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper
@@ -426,19 +457,9 @@ KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_W
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
 KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
-KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
-
-# Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
-KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
-KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
-KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
-
-# Decide whether we can support transactional memory
-KOKKOS_INTERNAL_USE_TM            := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc)
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@@ -541,36 +562,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SERIAL")
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_TM), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TM")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_X86_64")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_KNC")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCLE")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1)
-  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCBE")
-  tmp := $(call kokkos_append_header,"$H""endif")
-endif
-
 #only add the c++ standard flags if this is not CMake
 tmp := $(call kokkos_append_header,"/* General Settings */")
 ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
@@ -1031,7 +1022,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
   endif
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SKL), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xSKYLAKE
+    KOKKOS_LDFLAGS  += -xSKYLAKE
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+      else
+        # Nothing here yet.
+	KOKKOS_CXXFLAGS += -march=skylake
+	KOKKOS_LDFLAGS  += -march=skylake
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SKX), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
 
   ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
@@ -1045,13 +1057,31 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
 
       else
         # Nothing here yet.
-        KOKKOS_CXXFLAGS += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
-        KOKKOS_LDFLAGS  += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
+        KOKKOS_CXXFLAGS += -march=skylake-avx512 -mtune=skylake-avx512
+        KOKKOS_LDFLAGS  += -march=skylake-avx512 -mtune=skylake-avx512
       endif
     endif
   endif
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ICL), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+  KOKKOS_CXXFLAGS += -march=icelake-client -mtune=icelake-client
+  KOKKOS_LDFLAGS += -march=icelake-client -mtune=icelake-client
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ICX), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+  KOKKOS_CXXFLAGS += -march=icelake-server -mtune=icelake-server
+  KOKKOS_LDFLAGS += -march=icelake-server -mtune=icelake-server
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SPR), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
+  KOKKOS_CXXFLAGS += -march=sapphirerapids -mtune=sapphirerapids
+  KOKKOS_LDFLAGS += -march=sapphirerapids -mtune=sapphirerapids
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KNC")
   KOKKOS_CXXFLAGS += -mmic
@@ -1081,7 +1111,7 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp-targets=nvptx64 -Xopenmp-target -march
   endif
   KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
 endif
@@ -1182,29 +1212,29 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA900), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA900")
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx900
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx900
   endif
   ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906")
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906
   endif
   ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908
   endif
   ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
   endif
 
 
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp)
   ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-    KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_HIP.cpp
+    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
   endif
 
   KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
@@ -1220,51 +1250,67 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   endif
 endif
 
-# Figure out the architecture flag for SYCL.
+# Figure out Intel architecture flags.
 ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
-  # Lets start with adding architecture defines
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
-    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
-    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
-  endif
+  KOKKOS_INTERNAL_LC_BACKEND := sycl
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_INTERNAL_LC_BACKEND := openmp
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device gen9"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device gen11"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device gen12lp"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device dg1"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device xehp"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_PVC")
+  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f${KOKKOS_INTERNAL_LC_BACKEND}-targets=spir64_gen -X${KOKKOS_INTERNAL_LC_BACKEND}-target-backend "-device 12.4.0"
+endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp)
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp)
 
-  KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda
-  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+  KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda -fsycl-dead-args-optimization
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
   KOKKOS_LDFLAGS+=-fsycl
-  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) -D__STRICT_ANSI__
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
+  KOKKOS_CPPFLAGS+=-I$(KOKKOS_PATH)/tpls/desul/include
 else
   ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
     $(error Contradictory Desul atomics options: KOKKOS_OPTIONS=$(KOKKOS_OPTIONS) )
@@ -1349,7 +1395,7 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
   ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
-    KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
   endif
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
   ifneq ($(CUDA_PATH),)
@@ -1407,6 +1453,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
   KOKKOS_TPL_LIBRARY_NAMES += pthread
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Serial/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Serial/*.hpp)
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)
@@ -1439,15 +1490,6 @@ ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
   KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC))
 endif
 
-# Don't include Kokkos_Serial.cpp or Kokkos_Serial_Task.cpp if not using Serial
-# device to avoid a link warning.
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-endif
-ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp,$(KOKKOS_SRC))
-  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp,$(KOKKOS_SRC))
-endif
-
 # With Cygwin functions such as fdopen and fileno are not defined
 # when strict ansi is enabled. strict ansi gets enabled with -std=c++14
 # though. So we hard undefine it here. Not sure if that has any bad side effects
diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets
index a9cb12e1b..876726e94 100644
--- a/packages/kokkos/Makefile.targets
+++ b/packages/kokkos/Makefile.targets
@@ -16,10 +16,6 @@ Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
 Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
-Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
-Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
 Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
 Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -41,6 +37,13 @@ Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/
 Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
 
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp
+Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -50,8 +53,8 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
-Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
@@ -70,20 +73,18 @@ Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
 Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
-Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_HIP.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_HIP.cpp
+Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
-Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
 Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
 Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 endif
diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md
index f6c500cc1..033346e95 100644
--- a/packages/kokkos/README.md
+++ b/packages/kokkos/README.md
@@ -10,270 +10,48 @@ hierarchies and multiple types of execution resources. It currently can use
 CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other
 backends in development.
 
-Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,
-which also provides math kernels (https://github.com/kokkos/kokkos-kernels), as well as
-profiling and debugging tools (https://github.com/kokkos/kokkos-tools).
+**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.**
 
-# Learning about Kokkos
-
-The best way to start learning about Kokkos is going through the Kokkos Lectures.
-They are online available at https://kokkos.link/the-lectures and contain a mix
-of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem
-capabilities.
-
-A programming guide and API reference can be found on the Wiki
-(https://github.com/kokkos/kokkos/wiki).
-
-For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
-
-For non-public questions send an email to
-crtrott(at)sandia.gov
-
-# Contributing to Kokkos
-
-We are open and try to encourage contributions from external developers.
-To do so please first open an issue describing the contribution and then issue
-a pull request against the develop branch. For larger features it may be good
-to get guidance from the core development team first through the github issue.
-
-Note that Kokkos Core is licensed under standard 3-clause BSD terms of use.
-Which means contributing to Kokkos allows anyone else to use your contributions
-not just for public purposes but also for closed source commercial projects.
-For specifics see the LICENSE file contained in the repository or distribution.
-
-# Requirements
-
-### Minimum Compiler Versions
-
-Generally Kokkos should work with all compiler versions newer than the minimum.
-However as in all sufficiently complex enough code, we have to work around compiler
-bugs with almost all compilers. So compiler versions we don't test may have issues
-we are unaware of.
-
-* GCC: 5.3.0
-* Clang: 4.0.0
-* Intel: 17.0.1
-* NVCC: 9.2.88
-* NVC++: 21.5
-* ROCm: 4.3
-* MSVC: 19.29
-* IBM XL: 16.1.1
-* Fujitsu: 4.5.0
-* ARM/Clang 20.1
-
-### Primary Tested Compilers
-
-* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0
-* NVCC: 9.2.88, 10.1, 11.0
-* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0
-* Intel 17.4, 18.1, 19.5
-* MSVC: 19.29
-* ARM/Clang: 20.1
-* IBM XL: 16.1.1
-* ROCm: 4.3.0
+For the complete documentation, click below:
 
-### Build system:
+# [kokkos.github.io/kokkos-core-wiki](https://kokkos.github.io/kokkos-core-wiki)
 
-* CMake >= 3.16: required
-* CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues).
-* CMake >= 3.21.1 for NVC++
-
-Primary tested compiler are passing in release mode
-with warnings as errors. They also are tested with a comprehensive set of
-backend combinations (i.e. OpenMP, Threads, Serial, OpenMP+Serial, ...).
-We are using the following set of flags:
-* GCC:
-   ````
-      -Wall -Wunused-parameter -Wshadow -pedantic
-      -Werror -Wsign-compare -Wtype-limits
-      -Wignored-qualifiers -Wempty-body
-      -Wclobbered -Wuninitialized
-   ````
-* Intel:
-    ````
-      -Wall -Wunused-parameter -Wshadow -pedantic
-      -Werror -Wsign-compare -Wtype-limits
-      -Wuninitialized
-    ````
-* Clang:
-    ````
-      -Wall -Wunused-parameter -Wshadow -pedantic
-      -Werror -Wsign-compare -Wtype-limits
-      -Wuninitialized
-    ````
-
-* NVCC:
-  ````
-    -Wall -Wunused-parameter -Wshadow -pedantic
-    -Werror -Wsign-compare -Wtype-limits
-    -Wuninitialized
-  ````
-
-Other compilers are tested occasionally, in particular when pushing from develop to
-master branch. These are tested less rigorously without `-Werror` and only for a select set of backends.
-
-# Building and Installing Kokkos
-Kokkos provide a CMake build system and a raw Makefile build system.
-The CMake build system is strongly encouraged and will be the most rigorously supported in future releases.
-Full details are given in the [build instructions](BUILD.md). Basic setups are shown here:
-
-## CMake
-
-The best way to install Kokkos is using the CMake build system. Assuming Kokkos lives in `$srcdir`:
-````bash
-cmake $srcdir \
-  -DCMAKE_CXX_COMPILER=$path_to_compiler \
-  -DCMAKE_INSTALL_PREFIX=$path_to_install \
-  -DKokkos_ENABLE_OPENMP=On \
-  -DKokkos_ARCH_HSW=On \
-  -DKokkos_HWLOC_DIR=$path_to_hwloc
-````
-then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages.
-
-To validate the Kokkos build, configure with
-````
- -DKokkos_ENABLE_TESTS=On
-````
-and run `make test` after completing the build.
+# Learning about Kokkos
 
-For your CMake project using Kokkos, code such as the following:
+To start learning about Kokkos:
 
-````cmake
-find_package(Kokkos)
-...
-target_link_libraries(myTarget Kokkos::kokkos)
-````
-should be added to your CMakeLists.txt. Your configure should additionally include
-````
--DKokkos_DIR=$path_to_install/cmake/lib/Kokkos
-````
-or
-````
--DKokkos_ROOT=$path_to_install
-````
-for the install location given above.
+- [Kokkos Lectures](https://kokkos.github.io/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities.
 
-## Spack
-An alternative to manually building with the CMake is to use the Spack package manager.
-To get started, download the Spack [repo](https://github.com/spack/spack).
-````
-A basic installation would be done as:
-````bash
-> spack install kokkos
-````
-Spack allows options and and compilers to be tuned in the install command.
-````bash
-> spack install kokkos@3.0 %gcc@7.3.0 +openmp
-````
-This example illustrates the three most common parameters to Spack:
-* Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options.
-* Version:  immediately following `kokkos` the `@version` can specify a particular Kokkos to build
-* Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%`option.
+- [Programming guide](https://kokkos.github.io/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch.
 
-For a complete list of Kokkos options, run:
-````bash
-> spack info kokkos
-````
-Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
-Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
-More details are given in the [build instructions](BUILD.md). If you must know, you can locate Spack Kokkos installations with:
-````bash
-> spack find -p kokkos ...
-````
-where `...` is the unique spec identifying the particular Kokkos configuration and version.
-Some more details can found in the Kokkos spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest).
+- [API reference](https://kokkos.github.io/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.github.io/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.github.io/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.github.io/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.github.io/kokkos-core-wiki/API/alphabetical.html).
 
-## Raw Makefile
+- [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability.
 
-Raw Makefiles are only supported via inline builds. See below.
+For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
 
-## Inline Builds vs. Installed Package
-For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package.
-The main reason is that you may otherwise need many different
-configurations of Kokkos installed depending on the required compile time
-features an application needs. For example there is only one default
-execution space, which means you need different installations to have OpenMP
-or C++ threads as the default space. Also for the CUDA backend there are certain
-choices, such as allowing relocatable device code, which must be made at
-installation time. Building Kokkos inline uses largely the same process
-as compiling an application against an installed Kokkos library.
+For non-public questions send an email to: *crtrott(at)sandia.gov*
 
-For CMake, this means copying over the Kokkos source code into your project and adding `add_subdirectory(kokkos)` to your CMakeLists.txt.
+# Contributing to Kokkos
 
-For raw Makefiles, see the example benchmarks/bytes_and_flops/Makefile which can be used with an installed library and or an inline build.
+Please see [this page](https://kokkos.github.io/kokkos-core-wiki/contributing.html) for details on how to contribute.
 
-# Kokkos and CUDA UVM
+# Requirements, Building and Installing
 
-Kokkos does support UVM as a specific memory space called CudaUVMSpace.
-Allocations made with that space are accessible from host and device.
-You can tell Kokkos to use that as the default space for Cuda allocations.
-In either case UVM comes with a number of restrictions:
-* You can't access allocations on the host while a kernel is potentially
-running. This will lead to segfaults. To avoid that you either need to
-call Kokkos::Cuda::fence() (or just Kokkos::fence()), after kernels, or
-you can set the environment variable CUDA_LAUNCH_BLOCKING=1.
-* In multi socket multi GPU machines without NVLINK, UVM defaults
-to using zero copy allocations for technical reasons related to using multiple
-GPUs from the same process. If an executable doesn't do that (e.g. each
-MPI rank of an application uses a single GPU [can be the same GPU for
-multiple MPI ranks]) you can set CUDA_MANAGED_FORCE_DEVICE_ALLOC=1.
-This will enforce proper UVM allocations, but can lead to errors if
-more than a single GPU is used by a single process.
+All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.github.io/kokkos-core-wiki/requirements.html).
 
+Building and installation instructions are described [here](https://kokkos.github.io/kokkos-core-wiki/building.html).
 
 # Citing Kokkos
 
-If you publish work which mentions Kokkos, please cite the following paper:
-
-````BibTex
-@ARTICLE{9485033,
-  author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah},
-  journal={IEEE Transactions on Parallel and Distributed Systems},
-  title={Kokkos 3: Programming Model Extensions for the Exascale Era},
-  year={2022},
-  volume={33},
-  number={4},
-  pages={805-817},
-  doi={10.1109/TPDS.2021.3097283}}
-````
-
-If you use more than one Kokkos EcoSystem package, please also cite:
-
-````BibTex
-@ARTICLE{9502936,
-  author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff},
-  journal={Computing in Science   Engineering},
-  title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing},
-  year={2021},
-  volume={23},
-  number={5},
-  pages={10-18},
-  doi={10.1109/MCSE.2021.3098509}}
-````
-
-
-And if you feel generous: feel free to cite the original Kokkos paper which describes most of the basic Kokkos concepts:
-
-````BibTeX
-@article{CarterEdwards20143202,
-  title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
-  journal = "Journal of Parallel and Distributed Computing ",
-  volume = "74",
-  number = "12",
-  pages = "3202 - 3216",
-  year = "2014",
-  note = "Domain-Specific Languages and High-Level Frameworks for High-Performance Computing ",
-  issn = "0743-7315",
-  doi = "https://doi.org/10.1016/j.jpdc.2014.07.003",
-  url = "http://www.sciencedirect.com/science/article/pii/S0743731514001257",
-  author = "H. Carter Edwards and Christian R. Trott and Daniel Sunderland"
-}
-````
+Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citation.html).
 
-##### [LICENSE](https://github.com/kokkos/kokkos/blob/master/LICENSE)
+# License
 
 [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
 
 Under the terms of Contract DE-NA0003525 with NTESS,
 the U.S. Government retains certain rights in this software.
 
+The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or
+[here](https://github.com/kokkos/kokkos/blob/master/LICENSE).
diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt
index 4b60d887f..597626b11 100644
--- a/packages/kokkos/algorithms/src/CMakeLists.txt
+++ b/packages/kokkos/algorithms/src/CMakeLists.txt
@@ -11,6 +11,7 @@ FILE(GLOB ALGO_HEADERS *.hpp)
 FILE(GLOB ALGO_SOURCES *.cpp)
 LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
 APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp)
+APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp)
 
 INSTALL (
   DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
index 59c11afd9..1d85ffdfb 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_RANDOM_HPP
 #define KOKKOS_RANDOM_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
+#endif
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Complex.hpp>
@@ -648,63 +652,44 @@ struct Random_UniqueIndex {
   }
 };
 
-#ifdef KOKKOS_ENABLE_CUDA
-template <class MemorySpace>
-struct Random_UniqueIndex<Kokkos::Device<Kokkos::Cuda, MemorySpace>> {
-  using locks_view_type =
-      View<int**, Kokkos::Device<Kokkos::Cuda, MemorySpace>>;
-  KOKKOS_FUNCTION
-  static int get_state_idx(const locks_view_type& locks_) {
-#ifdef __CUDA_ARCH__
-    const int i_offset =
-        (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
-    int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
-                 blockDim.x * blockDim.y * blockDim.z +
-             i_offset) %
-            locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
-      i += blockDim.x * blockDim.y * blockDim.z;
-      if (i >= static_cast<int>(locks_.extent(0))) {
-        i = i_offset;
-      }
-    }
-    return i;
-#else
-    (void)locks_;
-    return 0;
-#endif
-  }
-};
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::Cuda
+#elif defined(KOKKOS_ENABLE_HIP)
+#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::Experimental::HIP
 #endif
 
-#ifdef KOKKOS_ENABLE_HIP
 template <class MemorySpace>
 struct Random_UniqueIndex<
-    Kokkos::Device<Kokkos::Experimental::HIP, MemorySpace>> {
+    Kokkos::Device<KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP, MemorySpace>> {
   using locks_view_type =
-      View<int**, Kokkos::Device<Kokkos::Experimental::HIP, MemorySpace>>;
+      View<int**, Kokkos::Device<KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP,
+                                 MemorySpace>>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
-#ifdef __HIP_DEVICE_COMPILE__
-    const int i_offset =
-        (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
-    int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
+    KOKKOS_IF_ON_DEVICE((
+        const int i_offset =
+            (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
+        int i =
+            (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
                  blockDim.x * blockDim.y * blockDim.z +
              i_offset) %
             locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
-      i += blockDim.x * blockDim.y * blockDim.z;
-      if (i >= static_cast<int>(locks_.extent(0))) {
-        i = i_offset;
-      }
-    }
-    return i;
-#else
-    (void)locks_;
-    return 0;
-#endif
+        while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
+          i += blockDim.x * blockDim.y * blockDim.z;
+          if (i >= static_cast<int>(locks_.extent(0))) {
+            i = i_offset;
+          }
+        }
+
+        return i;))
+    KOKKOS_IF_ON_HOST(((void)locks_; return 0;))
   }
 };
+
+#undef KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP
+
 #endif
 
 #ifdef KOKKOS_ENABLE_SYCL
@@ -1279,7 +1264,6 @@ struct fill_random_functor_begin_end;
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 0,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1303,7 +1287,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 0,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1331,7 +1314,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1361,7 +1343,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1392,7 +1373,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1424,7 +1404,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1458,7 +1437,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1494,7 +1472,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1532,7 +1509,6 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8,
                                      IndexType> {
-  using execution_space = typename ViewType::execution_space;
   ViewType a;
   RandomPool rand_pool;
   typename ViewType::const_value_type begin, end;
@@ -1569,34 +1545,57 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8,
   }
 };
 
-template <class ViewType, class RandomPool, class IndexType = int64_t>
-void fill_random(ViewType a, RandomPool g,
+template <class ExecutionSpace, class ViewType, class RandomPool,
+          class IndexType = int64_t>
+void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
                  typename ViewType::const_value_type begin,
                  typename ViewType::const_value_type end) {
   int64_t LDA = a.extent(0);
   if (LDA > 0)
-    parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
-                 Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
-                                                     ViewType::Rank, IndexType>(
-                     a, g, begin, end));
+    parallel_for(
+        "Kokkos::fill_random",
+        Kokkos::RangePolicy<ExecutionSpace>(exec, 0, (LDA + 127) / 128),
+        Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
+                                            ViewType::Rank, IndexType>(
+            a, g, begin, end));
 }
 
 }  // namespace Impl
 
+template <class ExecutionSpace, class ViewType, class RandomPool,
+          class IndexType = int64_t>
+void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
+                 typename ViewType::const_value_type begin,
+                 typename ViewType::const_value_type end) {
+  Impl::apply_to_view_of_static_rank(
+      [&](auto dst) { Kokkos::Impl::fill_random(exec, dst, g, begin, end); },
+      a);
+}
+
+template <class ExecutionSpace, class ViewType, class RandomPool,
+          class IndexType = int64_t>
+void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
+                 typename ViewType::const_value_type range) {
+  fill_random(exec, a, g, 0, range);
+}
+
 template <class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g,
                  typename ViewType::const_value_type begin,
                  typename ViewType::const_value_type end) {
-  Impl::apply_to_view_of_static_rank(
-      [&](auto dst) { Kokkos::Impl::fill_random(dst, g, begin, end); }, a);
+  fill_random(typename ViewType::execution_space{}, a, g, begin, end);
 }
 
 template <class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g,
                  typename ViewType::const_value_type range) {
-  fill_random(a, g, 0, range);
+  fill_random(typename ViewType::execution_space{}, a, g, 0, range);
 }
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
+#endif
 #endif
diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
index ce97de9b7..ad0c2d47b 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_SORT_HPP_
 #define KOKKOS_SORT_HPP_
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
+#endif
 
 #include <Kokkos_Core.hpp>
 
@@ -120,13 +124,13 @@ class BinSort {
     // If a Kokkos::View then can generate constant random access
     // otherwise can only use the constant type.
 
-    using src_view_type = typename std::conditional<
+    using src_view_type = std::conditional_t<
         Kokkos::is_view<SrcViewType>::value,
         Kokkos::View<typename SrcViewType::const_data_type,
                      typename SrcViewType::array_layout,
                      typename SrcViewType::device_type,
                      Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-        typename SrcViewType::const_type>::type;
+        typename SrcViewType::const_type>;
 
     using perm_view_type = typename PermuteViewType::const_type;
 
@@ -151,8 +155,11 @@ class BinSort {
     }
   };
 
-  using execution_space = typename Space::execution_space;
-  using bin_op_type     = BinSortOp;
+  // Naming this alias "execution_space" would be problematic since it would be
+  // considered as execution space for the various functors which might use
+  // another execution space through sort() or create_permute_vector().
+  using exec_space  = typename Space::execution_space;
+  using bin_op_type = BinSortOp;
 
   struct bin_count_tag {};
   struct bin_offset_tag {};
@@ -171,13 +178,13 @@ class BinSort {
   // If a Kokkos::View then can generate constant random access
   // otherwise can only use the constant type.
 
-  using const_rnd_key_view_type = typename std::conditional<
+  using const_rnd_key_view_type = std::conditional_t<
       Kokkos::is_view<KeyViewType>::value,
       Kokkos::View<typename KeyViewType::const_data_type,
                    typename KeyViewType::array_layout,
                    typename KeyViewType::device_type,
                    Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-      const_key_view_type>::type;
+      const_key_view_type>;
 
   using non_const_key_scalar = typename KeyViewType::non_const_value_type;
   using const_key_scalar     = typename KeyViewType::const_value_type;
@@ -220,6 +227,14 @@ class BinSort {
         range_begin(range_begin_),
         range_end(range_end_),
         sort_within_bins(sort_within_bins_) {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+    if (bin_op.max_bins() <= 0)
+      Kokkos::abort(
+          "The number of bins in the BinSortOp object must be greater than 0!");
     bin_count_atomic = Kokkos::View<int*, Space>(
         "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins());
     bin_count_const = bin_count_atomic;
@@ -235,7 +250,7 @@ class BinSort {
 
   BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
           BinSortOp bin_op_, bool sort_within_bins_ = false)
-      : BinSort(execution_space{}, keys_, range_begin_, range_end_, bin_op_,
+      : BinSort(exec_space{}, keys_, range_begin_, range_end_, bin_op_,
                 sort_within_bins_) {}
 
   template <typename ExecutionSpace>
@@ -245,13 +260,19 @@ class BinSort {
 
   BinSort(const_key_view_type keys_, BinSortOp bin_op_,
           bool sort_within_bins_ = false)
-      : BinSort(execution_space{}, keys_, bin_op_, sort_within_bins_) {}
+      : BinSort(exec_space{}, keys_, bin_op_, sort_within_bins_) {}
 
   //----------------------------------------
   // Create the permutation vector, the bin_offset array and the bin_count
   // array. Can be called again if keys changed
-  template <class ExecutionSpace = execution_space>
-  void create_permute_vector(const ExecutionSpace& exec = execution_space{}) {
+  template <class ExecutionSpace = exec_space>
+  void create_permute_vector(const ExecutionSpace& exec = exec_space{}) {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+
     const size_t len = range_end - range_begin;
     Kokkos::parallel_for(
         "Kokkos::Sort::BinCount",
@@ -281,6 +302,17 @@ class BinSort {
   template <class ExecutionSpace, class ValuesViewType>
   void sort(const ExecutionSpace& exec, ValuesViewType const& values,
             int values_range_begin, int values_range_end) const {
+    static_assert(
+        Kokkos::SpaceAccessibility<ExecutionSpace,
+                                   typename Space::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "BinSort was initialized with!");
+    static_assert(
+        Kokkos::SpaceAccessibility<
+            ExecutionSpace, typename ValuesViewType::memory_space>::accessible,
+        "The provided execution space must be able to access the memory space "
+        "of the View argument!");
+
     using scratch_view_type =
         Kokkos::View<typename ValuesViewType::data_type,
                      typename ValuesViewType::array_layout,
@@ -340,7 +372,7 @@ class BinSort {
   template <class ValuesViewType>
   void sort(ValuesViewType const& values, int values_range_begin,
             int values_range_end) const {
-    execution_space exec;
+    exec_space exec;
     sort(exec, values, values_range_begin, values_range_end);
     exec.fence("Kokkos::Sort: fence after sorting");
   }
@@ -428,7 +460,7 @@ struct BinOp1D {
 
   BinOp1D() = default;
 
-  // Construct BinOp with number of bins, minimum value and maxuimum value
+  // Construct BinOp with number of bins, minimum value and maximum value
   BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
           typename KeyViewType::const_value_type max)
       : max_bins_(max_bins__ + 1),
@@ -554,11 +586,7 @@ struct min_max_functor {
 
 template <class ExecutionSpace, class ViewType>
 std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
-    const ExecutionSpace& exec, ViewType const& view,
-    bool const always_use_kokkos_sort = false) {
-  if (!always_use_kokkos_sort) {
-    if (Impl::try_std_sort(view, exec)) return;
-  }
+    const ExecutionSpace& exec, ViewType const& view) {
   using CompType = BinOp1D<ViewType>;
 
   Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
@@ -596,12 +624,38 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
   bin_sort.sort(exec, view);
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class ExecutionSpace, class ViewType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload not taking bool always_use_kokkos_sort")
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
+    const ExecutionSpace& exec, ViewType const& view,
+    bool const always_use_kokkos_sort) {
+  if (!always_use_kokkos_sort && Impl::try_std_sort(view, exec)) {
+    return;
+  } else {
+    sort(exec, view);
+  }
+}
+#endif
+
+template <class ViewType>
+void sort(ViewType const& view) {
+  typename ViewType::execution_space exec;
+  sort(exec, view);
+  exec.fence("Kokkos::Sort: fence after sorting");
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 template <class ViewType>
-void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload not taking bool always_use_kokkos_sort")
+void sort(ViewType const& view, bool const always_use_kokkos_sort) {
   typename ViewType::execution_space exec;
   sort(exec, view, always_use_kokkos_sort);
   exec.fence("Kokkos::Sort: fence after sorting");
 }
+#endif
 
 template <class ExecutionSpace, class ViewType>
 std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
@@ -635,4 +689,8 @@ void sort(ViewType view, size_t const begin, size_t const end) {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT
+#endif
 #endif
diff --git a/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp b/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp
index 2e3babbcf..3e0f731cf 100644
--- a/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp
@@ -44,59 +44,103 @@
 
 #ifndef KOKKOS_STD_ALGORITHMS_HPP
 #define KOKKOS_STD_ALGORITHMS_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STD_ALGORITHMS
+#endif
 
 /// \file Kokkos_StdAlgorithms.hpp
 /// \brief Kokkos counterparts for Standard C++ Library algorithms
 
-#include <std_algorithms/Kokkos_Constraints.hpp>
-#include <std_algorithms/Kokkos_RandomAccessIterator.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
+#include "std_algorithms/impl/Kokkos_Constraints.hpp"
+#include "std_algorithms/impl/Kokkos_RandomAccessIterator.hpp"
+#include "std_algorithms/Kokkos_BeginEnd.hpp"
 
 // distance
-#include <std_algorithms/Kokkos_Distance.hpp>
+#include "std_algorithms/Kokkos_Distance.hpp"
+
+// note that we categorize below the headers
+// following the std classification.
 
-// move, swap, iter_swap
-#include "std_algorithms/Kokkos_ModifyingOperations.hpp"
+// modifying ops
+#include "std_algorithms/Kokkos_Swap.hpp"
+#include "std_algorithms/Kokkos_IterSwap.hpp"
 
-// find, find_if, find_if_not
-// for_each, for_each_n
-// mismatch
-// equal
-// count_if, count
-// all_of, any_of, none_of
-// adjacent_find
-// lexicographical_compare
-// search, search_n
-// find_first_of, find_end
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
+// non-modifying sequence
+#include "std_algorithms/Kokkos_AdjacentFind.hpp"
+#include "std_algorithms/Kokkos_Count.hpp"
+#include "std_algorithms/Kokkos_CountIf.hpp"
+#include "std_algorithms/Kokkos_AllOf.hpp"
+#include "std_algorithms/Kokkos_AnyOf.hpp"
+#include "std_algorithms/Kokkos_NoneOf.hpp"
+#include "std_algorithms/Kokkos_Equal.hpp"
+#include "std_algorithms/Kokkos_Find.hpp"
+#include "std_algorithms/Kokkos_FindIf.hpp"
+#include "std_algorithms/Kokkos_FindIfNot.hpp"
+#include "std_algorithms/Kokkos_FindEnd.hpp"
+#include "std_algorithms/Kokkos_FindFirstOf.hpp"
+#include "std_algorithms/Kokkos_ForEach.hpp"
+#include "std_algorithms/Kokkos_ForEachN.hpp"
+#include "std_algorithms/Kokkos_LexicographicalCompare.hpp"
+#include "std_algorithms/Kokkos_Mismatch.hpp"
+#include "std_algorithms/Kokkos_Search.hpp"
+#include "std_algorithms/Kokkos_SearchN.hpp"
 
-// replace, replace_copy_if, replace_copy, replace_if
-// copy, copy_n, copy_backward, copy_if
-// fill, fill_n
-// transform
-// generate, generate_n
-// reverse, reverse_copy
-// move, move_backward
-// swap_ranges
-// unique, unique_copy
-// rotate, rotate_copy
-// remove, remove_if, remove_copy, remove_copy_if
-// shift_left, shift_right
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
+// modifying sequence
+#include "std_algorithms/Kokkos_Fill.hpp"
+#include "std_algorithms/Kokkos_FillN.hpp"
+#include "std_algorithms/Kokkos_Replace.hpp"
+#include "std_algorithms/Kokkos_ReplaceIf.hpp"
+#include "std_algorithms/Kokkos_ReplaceCopyIf.hpp"
+#include "std_algorithms/Kokkos_ReplaceCopy.hpp"
+#include "std_algorithms/Kokkos_Copy.hpp"
+#include "std_algorithms/Kokkos_CopyN.hpp"
+#include "std_algorithms/Kokkos_CopyBackward.hpp"
+#include "std_algorithms/Kokkos_CopyIf.hpp"
+#include "std_algorithms/Kokkos_Transform.hpp"
+#include "std_algorithms/Kokkos_Generate.hpp"
+#include "std_algorithms/Kokkos_GenerateN.hpp"
+#include "std_algorithms/Kokkos_Reverse.hpp"
+#include "std_algorithms/Kokkos_ReverseCopy.hpp"
+#include "std_algorithms/Kokkos_Move.hpp"
+#include "std_algorithms/Kokkos_MoveBackward.hpp"
+#include "std_algorithms/Kokkos_SwapRanges.hpp"
+#include "std_algorithms/Kokkos_Unique.hpp"
+#include "std_algorithms/Kokkos_UniqueCopy.hpp"
+#include "std_algorithms/Kokkos_Rotate.hpp"
+#include "std_algorithms/Kokkos_RotateCopy.hpp"
+#include "std_algorithms/Kokkos_Remove.hpp"
+#include "std_algorithms/Kokkos_RemoveIf.hpp"
+#include "std_algorithms/Kokkos_RemoveCopy.hpp"
+#include "std_algorithms/Kokkos_RemoveCopyIf.hpp"
+#include "std_algorithms/Kokkos_ShiftLeft.hpp"
+#include "std_algorithms/Kokkos_ShiftRight.hpp"
 
-// is_sorted_until, is_sorted
-#include <std_algorithms/Kokkos_SortingOperations.hpp>
+// sorting
+#include "std_algorithms/Kokkos_IsSortedUntil.hpp"
+#include "std_algorithms/Kokkos_IsSorted.hpp"
 
-// min_element, max_element, minmax_element
-#include <std_algorithms/Kokkos_MinMaxElementOperations.hpp>
+// min/max element
+#include "std_algorithms/Kokkos_MinElement.hpp"
+#include "std_algorithms/Kokkos_MaxElement.hpp"
+#include "std_algorithms/Kokkos_MinMaxElement.hpp"
 
-// is_partitioned, partition_copy, partition_point
-#include <std_algorithms/Kokkos_PartitioningOperations.hpp>
+// partitioning
+#include "std_algorithms/Kokkos_IsPartitioned.hpp"
+#include "std_algorithms/Kokkos_PartitionCopy.hpp"
+#include "std_algorithms/Kokkos_PartitionPoint.hpp"
 
-// adjacent_difference
-// reduce, transform_reduce
-// exclusive_scan, transform_exclusive_scan
-// inclusive_scan, transform_inclusive_scan
-#include <std_algorithms/Kokkos_Numeric.hpp>
+// numeric
+#include "std_algorithms/Kokkos_AdjacentDifference.hpp"
+#include "std_algorithms/Kokkos_Reduce.hpp"
+#include "std_algorithms/Kokkos_TransformReduce.hpp"
+#include "std_algorithms/Kokkos_ExclusiveScan.hpp"
+#include "std_algorithms/Kokkos_TransformExclusiveScan.hpp"
+#include "std_algorithms/Kokkos_InclusiveScan.hpp"
+#include "std_algorithms/Kokkos_TransformInclusiveScan.hpp"
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STD_ALGORITHMS
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STD_ALGORITHMS
+#endif
 #endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_AdjacentDifference.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp
similarity index 72%
rename from packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_AdjacentDifference.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp
index 03e5fd6ae..0a7cf06f5 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_AdjacentDifference.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp
@@ -42,106 +42,15 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_NUMERICS_ADJACENT_DIFFERENCE_HPP
-#define KOKKOS_STD_NUMERICS_ADJACENT_DIFFERENCE_HPP
+#ifndef KOKKOS_STD_ALGORITHMS_ADJACENT_DIFFERENCE_HPP
+#define KOKKOS_STD_ALGORITHMS_ADJACENT_DIFFERENCE_HPP
 
-#include <Kokkos_Core.hpp>
-#include "../Kokkos_BeginEnd.hpp"
-#include "../Kokkos_Constraints.hpp"
-#include "../Kokkos_Distance.hpp"
+#include "impl/Kokkos_AdjacentDifference.hpp"
+#include "Kokkos_BeginEnd.hpp"
 
 namespace Kokkos {
 namespace Experimental {
-namespace Impl {
 
-// ------------------------
-//
-// functors
-//
-// ------------------------
-template <class ValueType1, class ValueType2, class RetType = ValueType2>
-struct StdAdjacentDifferenceDefaultBinaryOpFunctor {
-  KOKKOS_FUNCTION
-  constexpr RetType operator()(const ValueType1& a, const ValueType2& b) const {
-    return a - b;
-  }
-};
-
-template <class InputIteratorType, class OutputIteratorType,
-          class BinaryOperator>
-struct StdAdjacentDiffFunctor {
-  using index_type = typename InputIteratorType::difference_type;
-
-  const InputIteratorType m_first_from;
-  const OutputIteratorType m_first_dest;
-  BinaryOperator m_op;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i) const {
-    const auto& my_value = m_first_from[i];
-    if (i == 0) {
-      m_first_dest[i] = my_value;
-    } else {
-      const auto& left_value = m_first_from[i - 1];
-      m_first_dest[i]        = m_op(my_value, left_value);
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdAdjacentDiffFunctor(InputIteratorType first_from,
-                         OutputIteratorType first_dest, BinaryOperator op)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_op(std::move(op)) {}
-};
-
-// ------------------------------------------
-// adjacent_difference_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp>
-OutputIteratorType adjacent_difference_impl(const std::string& label,
-                                            const ExecutionSpace& ex,
-                                            InputIteratorType first_from,
-                                            InputIteratorType last_from,
-                                            OutputIteratorType first_dest,
-                                            BinaryOp bin_op) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  if (first_from == last_from) {
-    return first_dest;
-  }
-
-  // aliases
-  using value_type    = typename OutputIteratorType::value_type;
-  using aux_view_type = ::Kokkos::View<value_type*, ExecutionSpace>;
-  using functor_t =
-      StdAdjacentDiffFunctor<InputIteratorType, OutputIteratorType, BinaryOp>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  aux_view_type aux_view("aux_view", num_elements);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         functor_t(first_from, first_dest, bin_op));
-  ex.fence("Kokkos::adjacent_difference: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-}  // end namespace Impl
-
-// ------------------------
-//
-// public API
-//
-// ------------------------
 template <class ExecutionSpace, class InputIteratorType,
           class OutputIteratorType>
 std::enable_if_t<!::Kokkos::is_view<InputIteratorType>::value,
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp
new file mode 100644
index 000000000..332f9dd36
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp
@@ -0,0 +1,124 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ADJACENT_FIND_HPP
+#define KOKKOS_STD_ALGORITHMS_ADJACENT_FIND_HPP
+
+#include "impl/Kokkos_AdjacentFind.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set1
+template <class ExecutionSpace, class IteratorType>
+IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first,
+                           IteratorType last) {
+  return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default",
+                                  ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex,
+                           IteratorType first, IteratorType last) {
+  return Impl::adjacent_find_impl(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto adjacent_find(const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex,
+                                  KE::begin(v), KE::end(v));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto adjacent_find(const std::string& label, const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v));
+}
+
+// overload set2
+template <class ExecutionSpace, class IteratorType, class BinaryPredicateType>
+IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first,
+                           IteratorType last, BinaryPredicateType pred) {
+  return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default",
+                                  ex, first, last, pred);
+}
+
+template <class ExecutionSpace, class IteratorType, class BinaryPredicateType>
+IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex,
+                           IteratorType first, IteratorType last,
+                           BinaryPredicateType pred) {
+  return Impl::adjacent_find_impl(label, ex, first, last, pred);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class BinaryPredicateType>
+auto adjacent_find(const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType, Properties...>& v,
+                   BinaryPredicateType pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex,
+                                  KE::begin(v), KE::end(v), pred);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class BinaryPredicateType>
+auto adjacent_find(const std::string& label, const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType, Properties...>& v,
+                   BinaryPredicateType pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v), pred);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp
new file mode 100644
index 000000000..66a49541f
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AllOf.hpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ALL_OF_HPP
+#define KOKKOS_STD_ALGORITHMS_ALL_OF_HPP
+
+#include "impl/Kokkos_AllOfAnyOfNoneOf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class Predicate>
+bool all_of(const ExecutionSpace& ex, InputIterator first, InputIterator last,
+            Predicate predicate) {
+  return Impl::all_of_impl("Kokkos::all_of_iterator_api_default", ex, first,
+                           last, predicate);
+}
+
+template <class ExecutionSpace, class InputIterator, class Predicate>
+bool all_of(const std::string& label, const ExecutionSpace& ex,
+            InputIterator first, InputIterator last, Predicate predicate) {
+  return Impl::all_of_impl(label, ex, first, last, predicate);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+bool all_of(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& v,
+            Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::all_of_impl("Kokkos::all_of_view_api_default", ex, KE::cbegin(v),
+                           KE::cend(v), std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+bool all_of(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& v,
+            Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::all_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                           std::move(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp
new file mode 100644
index 000000000..e50e90f6d
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ANY_OF_HPP
+#define KOKKOS_STD_ALGORITHMS_ANY_OF_HPP
+
+#include "impl/Kokkos_AllOfAnyOfNoneOf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class Predicate>
+bool any_of(const ExecutionSpace& ex, InputIterator first, InputIterator last,
+            Predicate predicate) {
+  return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, first, last,
+                           predicate);
+}
+
+template <class ExecutionSpace, class InputIterator, class Predicate>
+bool any_of(const std::string& label, const ExecutionSpace& ex,
+            InputIterator first, InputIterator last, Predicate predicate) {
+  return Impl::any_of_impl(label, ex, first, last, predicate);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+bool any_of(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& v,
+            Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, KE::cbegin(v),
+                           KE::cend(v), std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+bool any_of(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& v,
+            Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::any_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                           std::move(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_BeginEnd.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_BeginEnd.hpp
index beb53fdd7..544919619 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_BeginEnd.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_BeginEnd.hpp
@@ -46,8 +46,8 @@
 #define KOKKOS_BEGIN_END_HPP
 
 #include <Kokkos_View.hpp>
-#include "Kokkos_RandomAccessIterator.hpp"
-#include "Kokkos_Constraints.hpp"
+#include "impl/Kokkos_RandomAccessIterator.hpp"
+#include "impl/Kokkos_Constraints.hpp"
 
 /// \file Kokkos_BeginEnd.hpp
 /// \brief Kokkos begin, end, cbegin, cend
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
new file mode 100644
index 000000000..b3237041b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp
@@ -0,0 +1,97 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COPY_HPP
+#define KOKKOS_STD_ALGORITHMS_COPY_HPP
+
+#include "impl/Kokkos_CopyCopyN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator copy(const ExecutionSpace& ex, InputIterator first,
+                    InputIterator last, OutputIterator d_first) {
+  return Impl::copy_impl("Kokkos::copy_iterator_api_default", ex, first, last,
+                         d_first);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator copy(const std::string& label, const ExecutionSpace& ex,
+                    InputIterator first, InputIterator last,
+                    OutputIterator d_first) {
+  return Impl::copy_impl(label, ex, first, last, d_first);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto copy(const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType1, Properties1...>& source,
+          ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::copy_impl("Kokkos::copy_view_api_default", ex,
+                         KE::cbegin(source), KE::cend(source), KE::begin(dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto copy(const std::string& label, const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType1, Properties1...>& source,
+          ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::copy_impl(label, ex, KE::cbegin(source), KE::cend(source),
+                         KE::begin(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
new file mode 100644
index 000000000..83efd9667
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COPY_BACKWARD_HPP
+#define KOKKOS_STD_ALGORITHMS_COPY_BACKWARD_HPP
+
+#include "impl/Kokkos_CopyBackward.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 copy_backward(const ExecutionSpace& ex, IteratorType1 first,
+                            IteratorType1 last, IteratorType2 d_last) {
+  return Impl::copy_backward_impl("Kokkos::copy_backward_iterator_api_default",
+                                  ex, first, last, d_last);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 copy_backward(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType1 first, IteratorType1 last,
+                            IteratorType2 d_last) {
+  return Impl::copy_backward_impl(label, ex, first, last, d_last);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto copy_backward(const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& source,
+                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::copy_backward_impl("Kokkos::copy_backward_view_api_default", ex,
+                                  cbegin(source), cend(source), end(dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto copy_backward(const std::string& label, const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& source,
+                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::copy_backward_impl(label, ex, cbegin(source), cend(source),
+                                  end(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
new file mode 100644
index 000000000..c83cc2988
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COPY_IF_HPP
+#define KOKKOS_STD_ALGORITHMS_COPY_IF_HPP
+
+#include "impl/Kokkos_CopyIf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class Predicate>
+OutputIterator copy_if(const ExecutionSpace& ex, InputIterator first,
+                       InputIterator last, OutputIterator d_first,
+                       Predicate pred) {
+  return Impl::copy_if_impl("Kokkos::copy_if_iterator_api_default", ex, first,
+                            last, d_first, std::move(pred));
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class Predicate>
+OutputIterator copy_if(const std::string& label, const ExecutionSpace& ex,
+                       InputIterator first, InputIterator last,
+                       OutputIterator d_first, Predicate pred) {
+  return Impl::copy_if_impl(label, ex, first, last, d_first, std::move(pred));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class Predicate>
+auto copy_if(const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType1, Properties1...>& source,
+             ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::copy_if_impl("Kokkos::copy_if_view_api_default", ex,
+                            cbegin(source), cend(source), begin(dest),
+                            std::move(pred));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class Predicate>
+auto copy_if(const std::string& label, const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType1, Properties1...>& source,
+             ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::copy_if_impl(label, ex, cbegin(source), cend(source),
+                            begin(dest), std::move(pred));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
new file mode 100644
index 000000000..7f3b9374c
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp
@@ -0,0 +1,98 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COPY_N_HPP
+#define KOKKOS_STD_ALGORITHMS_COPY_N_HPP
+
+#include "impl/Kokkos_CopyCopyN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class Size,
+          class OutputIterator>
+OutputIterator copy_n(const ExecutionSpace& ex, InputIterator first, Size count,
+                      OutputIterator result) {
+  return Impl::copy_n_impl("Kokkos::copy_n_iterator_api_default", ex, first,
+                           count, result);
+}
+
+template <class ExecutionSpace, class InputIterator, class Size,
+          class OutputIterator>
+OutputIterator copy_n(const std::string& label, const ExecutionSpace& ex,
+                      InputIterator first, Size count, OutputIterator result) {
+  return Impl::copy_n_impl(label, ex, first, count, result);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class Size, class DataType2, class... Properties2>
+auto copy_n(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
+            ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::copy_n_impl("Kokkos::copy_n_view_api_default", ex,
+                           KE::cbegin(source), count, KE::begin(dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class Size, class DataType2, class... Properties2>
+auto copy_n(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
+            ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::copy_n_impl(label, ex, KE::cbegin(source), count,
+                           KE::begin(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp
new file mode 100644
index 000000000..a885ee4ad
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Count.hpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COUNT_HPP
+#define KOKKOS_STD_ALGORITHMS_COUNT_HPP
+
+#include "impl/Kokkos_CountCountIf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class T>
+typename IteratorType::difference_type count(const ExecutionSpace& ex,
+                                             IteratorType first,
+                                             IteratorType last,
+                                             const T& value) {
+  return Impl::count_impl("Kokkos::count_iterator_api_default", ex, first, last,
+                          value);
+}
+
+template <class ExecutionSpace, class IteratorType, class T>
+typename IteratorType::difference_type count(const std::string& label,
+                                             const ExecutionSpace& ex,
+                                             IteratorType first,
+                                             IteratorType last,
+                                             const T& value) {
+  return Impl::count_impl(label, ex, first, last, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class T>
+auto count(const ExecutionSpace& ex,
+           const ::Kokkos::View<DataType, Properties...>& v, const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::count_impl("Kokkos::count_view_api_default", ex, KE::cbegin(v),
+                          KE::cend(v), value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class T>
+auto count(const std::string& label, const ExecutionSpace& ex,
+           const ::Kokkos::View<DataType, Properties...>& v, const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::count_impl(label, ex, KE::cbegin(v), KE::cend(v), value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp
new file mode 100644
index 000000000..98b9d74c4
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CountIf.hpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COUNT_IF_HPP
+#define KOKKOS_STD_ALGORITHMS_COUNT_IF_HPP
+
+#include "impl/Kokkos_CountCountIf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+typename IteratorType::difference_type count_if(const ExecutionSpace& ex,
+                                                IteratorType first,
+                                                IteratorType last,
+                                                Predicate predicate) {
+  return Impl::count_if_impl("Kokkos::count_if_iterator_api_default", ex, first,
+                             last, std::move(predicate));
+}
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+typename IteratorType::difference_type count_if(const std::string& label,
+                                                const ExecutionSpace& ex,
+                                                IteratorType first,
+                                                IteratorType last,
+                                                Predicate predicate) {
+  return Impl::count_if_impl(label, ex, first, last, std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+auto count_if(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& v,
+              Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::count_if_impl("Kokkos::count_if_view_api_default", ex,
+                             KE::cbegin(v), KE::cend(v), std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+auto count_if(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& v,
+              Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::count_if_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                             std::move(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Distance.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Distance.hpp
index ced437047..4e148642b 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Distance.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Distance.hpp
@@ -45,8 +45,8 @@
 #ifndef KOKKOS_STD_ALGORITHMS_DISTANCE_HPP
 #define KOKKOS_STD_ALGORITHMS_DISTANCE_HPP
 
-#include "Kokkos_Constraints.hpp"
-#include "Kokkos_RandomAccessIterator.hpp"
+#include "impl/Kokkos_Constraints.hpp"
+#include "impl/Kokkos_RandomAccessIterator.hpp"
 
 namespace Kokkos {
 namespace Experimental {
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
new file mode 100644
index 000000000..8634019fa
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp
@@ -0,0 +1,198 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_EQUAL_HPP
+#define KOKKOS_STD_ALGORITHMS_EQUAL_HPP
+
+#include "impl/Kokkos_Equal.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+      IteratorType2 first2) {
+  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
+                          last1, first2);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+      IteratorType1 last1, IteratorType2 first2) {
+  return Impl::equal_impl(label, ex, first1, last1, first2);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+      IteratorType2 first2, BinaryPredicateType predicate) {
+  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
+                          last1, first2, std::move(predicate));
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+      IteratorType1 last1, IteratorType2 first2,
+      BinaryPredicateType predicate) {
+  return Impl::equal_impl(label, ex, first1, last1, first2,
+                          std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+bool equal(const ExecutionSpace& ex,
+           const ::Kokkos::View<DataType1, Properties1...>& view1,
+           ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::equal_impl("Kokkos::equal_view_api_default", ex,
+                          KE::cbegin(view1), KE::cend(view1),
+                          KE::cbegin(view2));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+bool equal(const std::string& label, const ExecutionSpace& ex,
+           const ::Kokkos::View<DataType1, Properties1...>& view1,
+           ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1),
+                          KE::cbegin(view2));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+bool equal(const ExecutionSpace& ex,
+           const ::Kokkos::View<DataType1, Properties1...>& view1,
+           ::Kokkos::View<DataType2, Properties2...>& view2,
+           BinaryPredicateType predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::equal_impl("Kokkos::equal_view_api_default", ex,
+                          KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2),
+                          std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+bool equal(const std::string& label, const ExecutionSpace& ex,
+           const ::Kokkos::View<DataType1, Properties1...>& view1,
+           ::Kokkos::View<DataType2, Properties2...>& view2,
+           BinaryPredicateType predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1),
+                          KE::cbegin(view2), std::move(predicate));
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+      IteratorType2 first2, IteratorType2 last2) {
+  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
+                          last1, first2, last2);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+      IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
+  return Impl::equal_impl(label, ex, first1, last1, first2, last2);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+      IteratorType2 first2, IteratorType2 last2,
+      BinaryPredicateType predicate) {
+  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
+                          last1, first2, last2, std::move(predicate));
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      IteratorType1, IteratorType2>::value,
+                  bool>
+equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+      IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
+      BinaryPredicateType predicate) {
+  return Impl::equal_impl(label, ex, first1, last1, first2, last2,
+                          std::move(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp
new file mode 100644
index 000000000..b97710f24
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp
@@ -0,0 +1,190 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_EXCLUSIVE_SCAN_HPP
+#define KOKKOS_STD_ALGORITHMS_EXCLUSIVE_SCAN_HPP
+
+#include "impl/Kokkos_ExclusiveScan.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set 1
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest,
+               ValueType init_value) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  return Impl::exclusive_scan_default_op_impl(
+      "Kokkos::exclusive_scan_default_functors_iterator_api", ex, first, last,
+      first_dest, init_value);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+exclusive_scan(const std::string& label, const ExecutionSpace& ex,
+               InputIteratorType first, InputIteratorType last,
+               OutputIteratorType first_dest, ValueType init_value) {
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  return Impl::exclusive_scan_default_op_impl(label, ex, first, last,
+                                              first_dest, init_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType>
+auto exclusive_scan(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    ValueType init_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::exclusive_scan_default_op_impl(
+      "Kokkos::exclusive_scan_default_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
+      init_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType>
+auto exclusive_scan(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    ValueType init_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::exclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from),
+                                              KE::cend(view_from),
+                                              KE::begin(view_dest), init_value);
+}
+
+// overload set 2
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest,
+               ValueType init_value, BinaryOpType bop) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  return Impl::exclusive_scan_custom_op_impl(
+      "Kokkos::exclusive_scan_custom_functors_iterator_api", ex, first, last,
+      first_dest, init_value, bop);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+exclusive_scan(const std::string& label, const ExecutionSpace& ex,
+               InputIteratorType first, InputIteratorType last,
+               OutputIteratorType first_dest, ValueType init_value,
+               BinaryOpType bop) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  return Impl::exclusive_scan_custom_op_impl(label, ex, first, last, first_dest,
+                                             init_value, bop);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType,
+          class BinaryOpType>
+auto exclusive_scan(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    ValueType init_value, BinaryOpType bop) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::exclusive_scan_custom_op_impl(
+      "Kokkos::exclusive_scan_custom_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
+      init_value, bop);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType,
+          class BinaryOpType>
+auto exclusive_scan(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    ValueType init_value, BinaryOpType bop) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::exclusive_scan_custom_op_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), init_value, bop);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp
new file mode 100644
index 000000000..200e03b9d
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Fill.hpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FILL_HPP
+#define KOKKOS_STD_ALGORITHMS_FILL_HPP
+
+#include "impl/Kokkos_FillFillN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class T>
+void fill(const ExecutionSpace& ex, IteratorType first, IteratorType last,
+          const T& value) {
+  Impl::fill_impl("Kokkos::fill_iterator_api_default", ex, first, last, value);
+}
+
+template <class ExecutionSpace, class IteratorType, class T>
+void fill(const std::string& label, const ExecutionSpace& ex,
+          IteratorType first, IteratorType last, const T& value) {
+  Impl::fill_impl(label, ex, first, last, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class T>
+void fill(const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  Impl::fill_impl("Kokkos::fill_view_api_default", ex, begin(view), end(view),
+                  value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class T>
+void fill(const std::string& label, const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  Impl::fill_impl(label, ex, begin(view), end(view), value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp
new file mode 100644
index 000000000..2e814dc55
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FillN.hpp
@@ -0,0 +1,91 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FILL_N_HPP
+#define KOKKOS_STD_ALGORITHMS_FILL_N_HPP
+
+#include "impl/Kokkos_FillFillN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class SizeType, class T>
+IteratorType fill_n(const ExecutionSpace& ex, IteratorType first, SizeType n,
+                    const T& value) {
+  return Impl::fill_n_impl("Kokkos::fill_n_iterator_api_default", ex, first, n,
+                           value);
+}
+
+template <class ExecutionSpace, class IteratorType, class SizeType, class T>
+IteratorType fill_n(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, SizeType n, const T& value) {
+  return Impl::fill_n_impl(label, ex, first, n, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class T>
+auto fill_n(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view, SizeType n,
+            const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::fill_n_impl("Kokkos::fill_n_view_api_default", ex, begin(view),
+                           n, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class T>
+auto fill_n(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view, SizeType n,
+            const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::fill_n_impl(label, ex, begin(view), n, value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp
new file mode 100644
index 000000000..6758f00ce
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Find.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_HPP
+
+#include "impl/Kokkos_FindIfOrNot.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class T>
+InputIterator find(const ExecutionSpace& ex, InputIterator first,
+                   InputIterator last, const T& value) {
+  return Impl::find_impl("Kokkos::find_iterator_api_default", ex, first, last,
+                         value);
+}
+
+template <class ExecutionSpace, class InputIterator, class T>
+InputIterator find(const std::string& label, const ExecutionSpace& ex,
+                   InputIterator first, InputIterator last, const T& value) {
+  return Impl::find_impl(label, ex, first, last, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class T>
+auto find(const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_impl("Kokkos::find_view_api_default", ex, KE::begin(view),
+                         KE::end(view), value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class T>
+auto find(const std::string& label, const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_impl(label, ex, KE::begin(view), KE::end(view), value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp
new file mode 100644
index 000000000..61b54c822
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp
@@ -0,0 +1,149 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_END_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_END_HPP
+
+#include "impl/Kokkos_FindEnd.hpp"
+#include "Kokkos_Equal.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set 1: no binary predicate passed
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first,
+                       IteratorType1 last, IteratorType2 s_first,
+                       IteratorType2 s_last) {
+  return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first,
+                             last, s_first, s_last);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex,
+                       IteratorType1 first, IteratorType1 last,
+                       IteratorType2 s_first, IteratorType2 s_last) {
+  return Impl::find_end_impl(label, ex, first, last, s_first, s_last);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto find_end(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view,
+              const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex,
+                             KE::begin(view), KE::end(view), KE::begin(s_view),
+                             KE::end(s_view));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto find_end(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view,
+              const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view),
+                             KE::begin(s_view), KE::end(s_view));
+}
+
+// overload set 2: binary predicate passed
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first,
+                       IteratorType1 last, IteratorType2 s_first,
+                       IteratorType2 s_last, const BinaryPredicateType& pred) {
+  return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first,
+                             last, s_first, s_last, pred);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex,
+                       IteratorType1 first, IteratorType1 last,
+                       IteratorType2 s_first, IteratorType2 s_last,
+                       const BinaryPredicateType& pred) {
+  return Impl::find_end_impl(label, ex, first, last, s_first, s_last, pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto find_end(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view,
+              const ::Kokkos::View<DataType2, Properties2...>& s_view,
+              const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex,
+                             KE::begin(view), KE::end(view), KE::begin(s_view),
+                             KE::end(s_view), pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto find_end(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view,
+              const ::Kokkos::View<DataType2, Properties2...>& s_view,
+              const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view),
+                             KE::begin(s_view), KE::end(s_view), pred);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp
new file mode 100644
index 000000000..b8c27cb27
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp
@@ -0,0 +1,150 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_FIRST_OF_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_FIRST_OF_HPP
+
+#include "impl/Kokkos_FindFirstOf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set 1: no binary predicate passed
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first,
+                            IteratorType1 last, IteratorType2 s_first,
+                            IteratorType2 s_last) {
+  return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default",
+                                  ex, first, last, s_first, s_last);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType1 first, IteratorType1 last,
+                            IteratorType2 s_first, IteratorType2 s_last) {
+  return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto find_first_of(const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& view,
+                   const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex,
+                                  KE::begin(view), KE::end(view),
+                                  KE::begin(s_view), KE::end(s_view));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto find_first_of(const std::string& label, const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& view,
+                   const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view),
+                                  KE::begin(s_view), KE::end(s_view));
+}
+
+// overload set 2: binary predicate passed
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first,
+                            IteratorType1 last, IteratorType2 s_first,
+                            IteratorType2 s_last,
+                            const BinaryPredicateType& pred) {
+  return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default",
+                                  ex, first, last, s_first, s_last, pred);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType1 first, IteratorType1 last,
+                            IteratorType2 s_first, IteratorType2 s_last,
+                            const BinaryPredicateType& pred) {
+  return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last,
+                                  pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto find_first_of(const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& view,
+                   const ::Kokkos::View<DataType2, Properties2...>& s_view,
+                   const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex,
+                                  KE::begin(view), KE::end(view),
+                                  KE::begin(s_view), KE::end(s_view), pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto find_first_of(const std::string& label, const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& view,
+                   const ::Kokkos::View<DataType2, Properties2...>& s_view,
+                   const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view),
+                                  KE::begin(s_view), KE::end(s_view), pred);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp
new file mode 100644
index 000000000..54896da11
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIf.hpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_IF_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_IF_HPP
+
+#include "impl/Kokkos_FindIfOrNot.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+IteratorType find_if(const ExecutionSpace& ex, IteratorType first,
+                     IteratorType last, PredicateType predicate) {
+  return Impl::find_if_or_not_impl<true>("Kokkos::find_if_iterator_api_default",
+                                         ex, first, last, std::move(predicate));
+}
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+IteratorType find_if(const std::string& label, const ExecutionSpace& ex,
+                     IteratorType first, IteratorType last,
+                     PredicateType predicate) {
+  return Impl::find_if_or_not_impl<true>(label, ex, first, last,
+                                         std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+auto find_if(const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType, Properties...>& v,
+             Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_if_or_not_impl<true>("Kokkos::find_if_view_api_default", ex,
+                                         KE::begin(v), KE::end(v),
+                                         std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+auto find_if(const std::string& label, const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType, Properties...>& v,
+             Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_if_or_not_impl<true>(label, ex, KE::begin(v), KE::end(v),
+                                         std::move(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp
new file mode 100644
index 000000000..cfe6bb84d
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp
@@ -0,0 +1,98 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_IF_NOT_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_IF_NOT_HPP
+
+#include "impl/Kokkos_FindIfOrNot.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+IteratorType find_if_not(const ExecutionSpace& ex, IteratorType first,
+                         IteratorType last, Predicate predicate) {
+  return Impl::find_if_or_not_impl<false>(
+      "Kokkos::find_if_not_iterator_api_default", ex, first, last,
+      std::move(predicate));
+}
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+IteratorType find_if_not(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, IteratorType last,
+                         Predicate predicate) {
+  return Impl::find_if_or_not_impl<false>(label, ex, first, last,
+                                          std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+auto find_if_not(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v,
+                 Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_if_or_not_impl<false>(
+      "Kokkos::find_if_not_view_api_default", ex, KE::begin(v), KE::end(v),
+      std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+auto find_if_not(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v,
+                 Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::find_if_or_not_impl<false>(label, ex, KE::begin(v), KE::end(v),
+                                          std::move(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp
new file mode 100644
index 000000000..8a2f90e82
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FOR_EACH_HPP
+#define KOKKOS_STD_ALGORITHMS_FOR_EACH_HPP
+
+#include "impl/Kokkos_ForEachForEachN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
+UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex,
+                          IteratorType first, IteratorType last,
+                          UnaryFunctorType functor) {
+  return Impl::for_each_impl(label, ex, first, last, std::move(functor));
+}
+
+template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
+UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first,
+                          IteratorType last, UnaryFunctorType functor) {
+  return Impl::for_each_impl("Kokkos::for_each_iterator_api_default", ex, first,
+                             last, std::move(functor));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class UnaryFunctorType>
+UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex,
+                          const ::Kokkos::View<DataType, Properties...>& v,
+                          UnaryFunctorType functor) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::for_each_impl(label, ex, KE::begin(v), KE::end(v),
+                             std::move(functor));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class UnaryFunctorType>
+UnaryFunctorType for_each(const ExecutionSpace& ex,
+                          const ::Kokkos::View<DataType, Properties...>& v,
+                          UnaryFunctorType functor) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::for_each_impl("Kokkos::for_each_view_api_default", ex,
+                             KE::begin(v), KE::end(v), std::move(functor));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp
new file mode 100644
index 000000000..dd917a33e
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FOR_EACH_N_HPP
+#define KOKKOS_STD_ALGORITHMS_FOR_EACH_N_HPP
+
+#include "impl/Kokkos_ForEachForEachN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class UnaryFunctorType>
+IteratorType for_each_n(const std::string& label, const ExecutionSpace& ex,
+                        IteratorType first, SizeType n,
+                        UnaryFunctorType functor) {
+  return Impl::for_each_n_impl(label, ex, first, n, std::move(functor));
+}
+
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class UnaryFunctorType>
+IteratorType for_each_n(const ExecutionSpace& ex, IteratorType first,
+                        SizeType n, UnaryFunctorType functor) {
+  return Impl::for_each_n_impl("Kokkos::for_each_n_iterator_api_default", ex,
+                               first, n, std::move(functor));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class UnaryFunctorType>
+auto for_each_n(const std::string& label, const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType, Properties...>& v, SizeType n,
+                UnaryFunctorType functor) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::for_each_n_impl(label, ex, KE::begin(v), n, std::move(functor));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class UnaryFunctorType>
+auto for_each_n(const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType, Properties...>& v, SizeType n,
+                UnaryFunctorType functor) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::for_each_n_impl("Kokkos::for_each_n_view_api_default", ex,
+                               KE::begin(v), n, std::move(functor));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp
new file mode 100644
index 000000000..955cb42d4
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Generate.hpp
@@ -0,0 +1,91 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_GENERATE_HPP
+#define KOKKOS_STD_ALGORITHMS_GENERATE_HPP
+
+#include "impl/Kokkos_GenerateGenerateN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class Generator>
+void generate(const ExecutionSpace& ex, IteratorType first, IteratorType last,
+              Generator g) {
+  Impl::generate_impl("Kokkos::generate_iterator_api_default", ex, first, last,
+                      std::move(g));
+}
+
+template <class ExecutionSpace, class IteratorType, class Generator>
+void generate(const std::string& label, const ExecutionSpace& ex,
+              IteratorType first, IteratorType last, Generator g) {
+  Impl::generate_impl(label, ex, first, last, std::move(g));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Generator>
+void generate(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& view,
+              Generator g) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  Impl::generate_impl("Kokkos::generate_view_api_default", ex, begin(view),
+                      end(view), std::move(g));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Generator>
+void generate(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& view,
+              Generator g) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  Impl::generate_impl(label, ex, begin(view), end(view), std::move(g));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp
new file mode 100644
index 000000000..470edb159
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp
@@ -0,0 +1,93 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_GENERATE_N_HPP
+#define KOKKOS_STD_ALGORITHMS_GENERATE_N_HPP
+
+#include "impl/Kokkos_GenerateGenerateN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class Size, class Generator>
+IteratorType generate_n(const ExecutionSpace& ex, IteratorType first,
+                        Size count, Generator g) {
+  Impl::generate_n_impl("Kokkos::generate_n_iterator_api_default", ex, first,
+                        count, std::move(g));
+  return first + count;
+}
+
+template <class ExecutionSpace, class IteratorType, class Size, class Generator>
+IteratorType generate_n(const std::string& label, const ExecutionSpace& ex,
+                        IteratorType first, Size count, Generator g) {
+  Impl::generate_n_impl(label, ex, first, count, std::move(g));
+  return first + count;
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class Size,
+          class Generator>
+auto generate_n(const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType, Properties...>& view, Size count,
+                Generator g) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::generate_n_impl("Kokkos::generate_n_view_api_default", ex,
+                               begin(view), count, std::move(g));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties, class Size,
+          class Generator>
+auto generate_n(const std::string& label, const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType, Properties...>& view, Size count,
+                Generator g) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::generate_n_impl(label, ex, begin(view), count, std::move(g));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp
new file mode 100644
index 000000000..c34b5f43c
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp
@@ -0,0 +1,223 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_INCLUSIVE_SCAN_HPP
+#define KOKKOS_STD_ALGORITHMS_INCLUSIVE_SCAN_HPP
+
+#include "impl/Kokkos_InclusiveScan.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set 1
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest) {
+  return Impl::inclusive_scan_default_op_impl(
+      "Kokkos::inclusive_scan_default_functors_iterator_api", ex, first, last,
+      first_dest);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+inclusive_scan(const std::string& label, const ExecutionSpace& ex,
+               InputIteratorType first, InputIteratorType last,
+               OutputIteratorType first_dest) {
+  return Impl::inclusive_scan_default_op_impl(label, ex, first, last,
+                                              first_dest);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto inclusive_scan(
+    const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_default_op_impl(
+      "Kokkos::inclusive_scan_default_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto inclusive_scan(
+    const std::string& label, const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from),
+                                              KE::cend(view_from),
+                                              KE::begin(view_dest));
+}
+
+// overload set 2 (accepting custom binary op)
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOp>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest,
+               BinaryOp binary_op) {
+  return Impl::inclusive_scan_custom_binary_op_impl(
+      "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last,
+      first_dest, binary_op);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOp>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+inclusive_scan(const std::string& label, const ExecutionSpace& ex,
+               InputIteratorType first, InputIteratorType last,
+               OutputIteratorType first_dest, BinaryOp binary_op) {
+  return Impl::inclusive_scan_custom_binary_op_impl(label, ex, first, last,
+                                                    first_dest, binary_op);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOp>
+auto inclusive_scan(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    BinaryOp binary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_custom_binary_op_impl(
+      "Kokkos::inclusive_scan_custom_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
+      binary_op);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOp>
+auto inclusive_scan(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    BinaryOp binary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_custom_binary_op_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op);
+}
+
+// overload set 3
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOp, class ValueType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+               InputIteratorType last, OutputIteratorType first_dest,
+               BinaryOp binary_op, ValueType init_value) {
+  return Impl::inclusive_scan_custom_binary_op_impl(
+      "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last,
+      first_dest, binary_op, init_value);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOp, class ValueType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+inclusive_scan(const std::string& label, const ExecutionSpace& ex,
+               InputIteratorType first, InputIteratorType last,
+               OutputIteratorType first_dest, BinaryOp binary_op,
+               ValueType init_value) {
+  return Impl::inclusive_scan_custom_binary_op_impl(
+      label, ex, first, last, first_dest, binary_op, init_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOp,
+          class ValueType>
+auto inclusive_scan(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    BinaryOp binary_op, ValueType init_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_custom_binary_op_impl(
+      "Kokkos::inclusive_scan_custom_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
+      binary_op, init_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOp,
+          class ValueType>
+auto inclusive_scan(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    BinaryOp binary_op, ValueType init_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::inclusive_scan_custom_binary_op_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op, init_value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp
new file mode 100644
index 000000000..8a2ca207a
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp
@@ -0,0 +1,92 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_IS_PARTITIONED_HPP
+#define KOKKOS_STD_ALGORITHMS_IS_PARTITIONED_HPP
+
+#include "impl/Kokkos_IsPartitioned.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+bool is_partitioned(const ExecutionSpace& ex, IteratorType first,
+                    IteratorType last, PredicateType p) {
+  return Impl::is_partitioned_impl(
+      "Kokkos::is_partitioned_iterator_api_default", ex, first, last,
+      std::move(p));
+}
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+bool is_partitioned(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType last, PredicateType p) {
+  return Impl::is_partitioned_impl(label, ex, first, last, std::move(p));
+}
+
+template <class ExecutionSpace, class PredicateType, class DataType,
+          class... Properties>
+bool is_partitioned(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType, Properties...>& v,
+                    PredicateType p) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::is_partitioned_impl("Kokkos::is_partitioned_view_api_default",
+                                   ex, cbegin(v), cend(v), std::move(p));
+}
+
+template <class ExecutionSpace, class PredicateType, class DataType,
+          class... Properties>
+bool is_partitioned(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType, Properties...>& v,
+                    PredicateType p) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::is_partitioned_impl(label, ex, cbegin(v), cend(v), std::move(p));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp
new file mode 100644
index 000000000..0ab466f33
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp
@@ -0,0 +1,131 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_IS_SORTED_HPP
+#define KOKKOS_STD_ALGORITHMS_IS_SORTED_HPP
+
+#include "impl/Kokkos_IsSorted.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+bool is_sorted(const ExecutionSpace& ex, IteratorType first,
+               IteratorType last) {
+  return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex,
+                              first, last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+bool is_sorted(const std::string& label, const ExecutionSpace& ex,
+               IteratorType first, IteratorType last) {
+  return Impl::is_sorted_impl(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+bool is_sorted(const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex,
+                              KE::cbegin(view), KE::cend(view));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+bool is_sorted(const std::string& label, const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view));
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+bool is_sorted(const ExecutionSpace& ex, IteratorType first, IteratorType last,
+               ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex,
+                              first, last, std::move(comp));
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+bool is_sorted(const std::string& label, const ExecutionSpace& ex,
+               IteratorType first, IteratorType last, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  return Impl::is_sorted_impl(label, ex, first, last, std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class ComparatorType>
+bool is_sorted(const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType, Properties...>& view,
+               ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex,
+                              KE::cbegin(view), KE::cend(view),
+                              std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class ComparatorType>
+bool is_sorted(const std::string& label, const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType, Properties...>& view,
+               ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view),
+                              std::move(comp));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp
new file mode 100644
index 000000000..c480d9ee5
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp
@@ -0,0 +1,134 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_IS_SORTED_UNTIL_HPP
+#define KOKKOS_STD_ALGORITHMS_IS_SORTED_UNTIL_HPP
+
+#include "impl/Kokkos_IsSortedUntil.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first,
+                             IteratorType last) {
+  return Impl::is_sorted_until_impl(
+      "Kokkos::is_sorted_until_iterator_api_default", ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, IteratorType last) {
+  return Impl::is_sorted_until_impl(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto is_sorted_until(const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default",
+                                    ex, KE::begin(view), KE::end(view));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto is_sorted_until(const std::string& label, const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view));
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first,
+                             IteratorType last, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  return Impl::is_sorted_until_impl(
+      "Kokkos::is_sorted_until_iterator_api_default", ex, first, last,
+      std::move(comp));
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, IteratorType last,
+                             ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::is_sorted_until_impl(label, ex, first, last, std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class ComparatorType>
+auto is_sorted_until(const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType, Properties...>& view,
+                     ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default",
+                                    ex, KE::begin(view), KE::end(view),
+                                    std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class ComparatorType>
+auto is_sorted_until(const std::string& label, const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType, Properties...>& view,
+                     ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view),
+                                    std::move(comp));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ModifyingOperations.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp
similarity index 79%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_ModifyingOperations.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp
index f8ca3456e..1174740a5 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ModifyingOperations.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp
@@ -42,38 +42,17 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_MODIFYING_OPERATIONS_HPP
-#define KOKKOS_MODIFYING_OPERATIONS_HPP
+#ifndef KOKKOS_STD_ALGORITHMS_ITER_SWAP_HPP
+#define KOKKOS_STD_ALGORITHMS_ITER_SWAP_HPP
 
 #include <Kokkos_Core.hpp>
-#include "Kokkos_BeginEnd.hpp"
-#include "Kokkos_Constraints.hpp"
+#include "impl/Kokkos_Constraints.hpp"
+#include "Kokkos_Swap.hpp"
 
 namespace Kokkos {
 namespace Experimental {
-
-// move
-template <typename T>
-KOKKOS_INLINE_FUNCTION std::remove_reference_t<T>&& move(T&& t) {
-  return static_cast<std::remove_reference_t<T>&&>(t);
-}
-
-// swap
-template <class T>
-KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept {
-  static_assert(
-      std::is_move_assignable<T>::value && std::is_move_constructible<T>::value,
-      "Kokkos::Experimental::swap arguments must be move assignable "
-      "and move constructible");
-
-  T tmp = std::move(a);
-  a     = std::move(b);
-  b     = std::move(tmp);
-}
-
-//----------------------------------------------------------------------------
-// this is here because we use the swap function above
 namespace Impl {
+
 template <class IteratorType1, class IteratorType2>
 struct StdIterSwapFunctor {
   IteratorType1 m_a;
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
new file mode 100644
index 000000000..4704a9ec5
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp
@@ -0,0 +1,154 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_LEXICOGRAPHICAL_COMPARE_HPP
+#define KOKKOS_STD_ALGORITHMS_LEXICOGRAPHICAL_COMPARE_HPP
+
+#include "impl/Kokkos_LexicographicalCompare.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1,
+                             IteratorType1 last1, IteratorType2 first2,
+                             IteratorType2 last2) {
+  return Impl::lexicographical_compare_impl(
+      "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1,
+      first2, last2);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType1 first1, IteratorType1 last1,
+                             IteratorType2 first2, IteratorType2 last2) {
+  return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2,
+                                            last2);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+bool lexicographical_compare(
+    const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::lexicographical_compare_impl(
+      "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1),
+      KE::cend(view1), KE::cbegin(view2), KE::cend(view2));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+bool lexicographical_compare(
+    const std::string& label, const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1),
+                                            KE::cend(view1), KE::cbegin(view2),
+                                            KE::cend(view2));
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class ComparatorType>
+bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1,
+                             IteratorType1 last1, IteratorType2 first2,
+                             IteratorType2 last2, ComparatorType comp) {
+  return Impl::lexicographical_compare_impl(
+      "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1,
+      first2, last2, comp);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class ComparatorType>
+bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType1 first1, IteratorType1 last1,
+                             IteratorType2 first2, IteratorType2 last2,
+                             ComparatorType comp) {
+  return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2,
+                                            last2, comp);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ComparatorType>
+bool lexicographical_compare(
+    const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::lexicographical_compare_impl(
+      "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1),
+      KE::cend(view1), KE::cbegin(view2), KE::cend(view2), comp);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ComparatorType>
+bool lexicographical_compare(
+    const std::string& label, const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view1,
+    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1),
+                                            KE::cend(view1), KE::cbegin(view2),
+                                            KE::cend(view2), comp);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp
new file mode 100644
index 000000000..5f6e5cbf6
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp
@@ -0,0 +1,132 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MAX_ELEMENT_HPP
+#define KOKKOS_STD_ALGORITHMS_MAX_ELEMENT_HPP
+
+#include "impl/Kokkos_MinMaxMinmaxElement.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+auto max_element(const ExecutionSpace& ex, IteratorType first,
+                 IteratorType last) {
+  return Impl::min_or_max_element_impl<MaxFirstLoc>(
+      "Kokkos::max_element_iterator_api_default", ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+auto max_element(const std::string& label, const ExecutionSpace& ex,
+                 IteratorType first, IteratorType last) {
+  return Impl::min_or_max_element_impl<MaxFirstLoc>(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+auto max_element(const ExecutionSpace& ex, IteratorType first,
+                 IteratorType last, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+      "Kokkos::max_element_iterator_api_default", ex, first, last,
+      std::move(comp));
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+auto max_element(const std::string& label, const ExecutionSpace& ex,
+                 IteratorType first, IteratorType last, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+      label, ex, first, last, std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto max_element(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::min_or_max_element_impl<MaxFirstLoc>(
+      "Kokkos::max_element_view_api_default", ex, begin(v), end(v));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto max_element(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::min_or_max_element_impl<MaxFirstLoc>(label, ex, begin(v),
+                                                    end(v));
+}
+
+template <class ExecutionSpace, class DataType, class ComparatorType,
+          class... Properties>
+auto max_element(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v,
+                 ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+      "Kokkos::max_element_view_api_default", ex, begin(v), end(v),
+      std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class ComparatorType,
+          class... Properties>
+auto max_element(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v,
+                 ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
+      label, ex, begin(v), end(v), std::move(comp));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp
new file mode 100644
index 000000000..63cc54896
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp
@@ -0,0 +1,132 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MIN_ELEMENT_HPP
+#define KOKKOS_STD_ALGORITHMS_MIN_ELEMENT_HPP
+
+#include "impl/Kokkos_MinMaxMinmaxElement.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+auto min_element(const ExecutionSpace& ex, IteratorType first,
+                 IteratorType last) {
+  return Impl::min_or_max_element_impl<MinFirstLoc>(
+      "Kokkos::min_element_iterator_api_default", ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+auto min_element(const std::string& label, const ExecutionSpace& ex,
+                 IteratorType first, IteratorType last) {
+  return Impl::min_or_max_element_impl<MinFirstLoc>(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+auto min_element(const ExecutionSpace& ex, IteratorType first,
+                 IteratorType last, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+      "Kokkos::min_element_iterator_api_default", ex, first, last,
+      std::move(comp));
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+auto min_element(const std::string& label, const ExecutionSpace& ex,
+                 IteratorType first, IteratorType last, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+      label, ex, first, last, std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto min_element(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::min_or_max_element_impl<MinFirstLoc>(
+      "Kokkos::min_element_view_api_default", ex, begin(v), end(v));
+}
+
+template <class ExecutionSpace, class DataType, class ComparatorType,
+          class... Properties>
+auto min_element(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v,
+                 ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+      "Kokkos::min_element_view_api_default", ex, begin(v), end(v),
+      std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto min_element(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::min_or_max_element_impl<MinFirstLoc>(label, ex, begin(v),
+                                                    end(v));
+}
+
+template <class ExecutionSpace, class DataType, class ComparatorType,
+          class... Properties>
+auto min_element(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& v,
+                 ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
+      label, ex, begin(v), end(v), std::move(comp));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp
new file mode 100644
index 000000000..07cdefcc0
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp
@@ -0,0 +1,133 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MINMAX_ELEMENT_HPP
+#define KOKKOS_STD_ALGORITHMS_MINMAX_ELEMENT_HPP
+
+#include "impl/Kokkos_MinMaxMinmaxElement.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+auto minmax_element(const ExecutionSpace& ex, IteratorType first,
+                    IteratorType last) {
+  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(
+      "Kokkos::minmax_element_iterator_api_default", ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+auto minmax_element(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType last) {
+  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+auto minmax_element(const ExecutionSpace& ex, IteratorType first,
+                    IteratorType last, ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+      "Kokkos::minmax_element_iterator_api_default", ex, first, last,
+      std::move(comp));
+}
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+auto minmax_element(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType last,
+                    ComparatorType comp) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+      label, ex, first, last, std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto minmax_element(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(
+      "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto minmax_element(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType, Properties...>& v) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(label, ex, begin(v),
+                                                       end(v));
+}
+
+template <class ExecutionSpace, class DataType, class ComparatorType,
+          class... Properties>
+auto minmax_element(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType, Properties...>& v,
+                    ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+      "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v),
+      std::move(comp));
+}
+
+template <class ExecutionSpace, class DataType, class ComparatorType,
+          class... Properties>
+auto minmax_element(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType, Properties...>& v,
+                    ComparatorType comp) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
+      label, ex, begin(v), end(v), std::move(comp));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElementOperations.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElementOperations.hpp
deleted file mode 100644
index aa8f5ba37..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElementOperations.hpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_STD_MIN_MAX_ELEMENT_OPERATIONS_HPP
-#define KOKKOS_STD_MIN_MAX_ELEMENT_OPERATIONS_HPP
-
-#include <Kokkos_Core.hpp>
-#include "Kokkos_BeginEnd.hpp"
-#include "Kokkos_Constraints.hpp"
-#include "Kokkos_Distance.hpp"
-#include "Kokkos_ModifyingOperations.hpp"
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-template <class IteratorType, class ReducerType>
-struct StdMinOrMaxElemFunctor {
-  using index_type     = typename IteratorType::difference_type;
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType m_first;
-  ReducerType m_reducer;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, red_value_type& red_value) const {
-    m_reducer.join(red_value, red_value_type{m_first[i], i});
-  }
-
-  KOKKOS_FUNCTION
-  StdMinOrMaxElemFunctor(IteratorType first, ReducerType reducer)
-      : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
-};
-
-template <class IteratorType, class ReducerType>
-struct StdMinMaxElemFunctor {
-  using index_type     = typename IteratorType::difference_type;
-  using red_value_type = typename ReducerType::value_type;
-  IteratorType m_first;
-  ReducerType m_reducer;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, red_value_type& red_value) const {
-    const auto& my_value = m_first[i];
-    m_reducer.join(red_value, red_value_type{my_value, my_value, i, i});
-  }
-
-  KOKKOS_FUNCTION
-  StdMinMaxElemFunctor(IteratorType first, ReducerType reducer)
-      : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
-};
-
-// ------------------------------------------
-// min_or_max_element_impl
-// ------------------------------------------
-template <template <class... Args> class ReducerType, class ExecutionSpace,
-          class IteratorType, class... Args>
-IteratorType min_or_max_element_impl(const std::string& label,
-                                     const ExecutionSpace& ex,
-                                     IteratorType first, IteratorType last,
-                                     Args&&... args) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    return last;
-  }
-
-  // aliases
-  using index_type           = typename IteratorType::difference_type;
-  using value_type           = typename IteratorType::value_type;
-  using reducer_type         = ReducerType<value_type, index_type, Args...>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using func_t = StdMinOrMaxElemFunctor<IteratorType, reducer_type>;
-
-  // run
-  reduction_value_type red_result;
-  reducer_type reducer(red_result, std::forward<Args>(args)...);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first, reducer), reducer);
-
-  // fence not needed because reducing into scalar
-
-  // return
-  return first + red_result.loc;
-}
-
-// ------------------------------------------
-// minmax_element_impl
-// ------------------------------------------
-template <template <class... Args> class ReducerType, class ExecutionSpace,
-          class IteratorType, class... Args>
-::Kokkos::pair<IteratorType, IteratorType> minmax_element_impl(
-    const std::string& label, const ExecutionSpace& ex, IteratorType first,
-    IteratorType last, Args&&... args) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    return {first, first};
-  }
-
-  // aliases
-  using index_type           = typename IteratorType::difference_type;
-  using value_type           = typename IteratorType::value_type;
-  using reducer_type         = ReducerType<value_type, index_type, Args...>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using func_t               = StdMinMaxElemFunctor<IteratorType, reducer_type>;
-
-  // run
-  reduction_value_type red_result;
-  reducer_type reducer(red_result, std::forward<Args>(args)...);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first, reducer), reducer);
-
-  // fence not needed because reducing into scalar
-
-  // return
-  return {first + red_result.min_loc, first + red_result.max_loc};
-}
-
-}  // end namespace Impl
-
-// ----------------------
-// min_element public API
-// ----------------------
-template <class ExecutionSpace, class IteratorType>
-auto min_element(const ExecutionSpace& ex, IteratorType first,
-                 IteratorType last) {
-  return Impl::min_or_max_element_impl<MinFirstLoc>(
-      "Kokkos::min_element_iterator_api_default", ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-auto min_element(const std::string& label, const ExecutionSpace& ex,
-                 IteratorType first, IteratorType last) {
-  return Impl::min_or_max_element_impl<MinFirstLoc>(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-auto min_element(const ExecutionSpace& ex, IteratorType first,
-                 IteratorType last, ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
-      "Kokkos::min_element_iterator_api_default", ex, first, last,
-      std::move(comp));
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-auto min_element(const std::string& label, const ExecutionSpace& ex,
-                 IteratorType first, IteratorType last, ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
-      label, ex, first, last, std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto min_element(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::min_or_max_element_impl<MinFirstLoc>(
-      "Kokkos::min_element_view_api_default", ex, begin(v), end(v));
-}
-
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
-auto min_element(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v,
-                 ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
-      "Kokkos::min_element_view_api_default", ex, begin(v), end(v),
-      std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto min_element(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::min_or_max_element_impl<MinFirstLoc>(label, ex, begin(v),
-                                                    end(v));
-}
-
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
-auto min_element(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v,
-                 ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MinFirstLocCustomComparator>(
-      label, ex, begin(v), end(v), std::move(comp));
-}
-
-// ----------------------
-// max_element public API
-// ----------------------
-template <class ExecutionSpace, class IteratorType>
-auto max_element(const ExecutionSpace& ex, IteratorType first,
-                 IteratorType last) {
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(
-      "Kokkos::max_element_iterator_api_default", ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-auto max_element(const std::string& label, const ExecutionSpace& ex,
-                 IteratorType first, IteratorType last) {
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-auto max_element(const ExecutionSpace& ex, IteratorType first,
-                 IteratorType last, ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
-      "Kokkos::max_element_iterator_api_default", ex, first, last,
-      std::move(comp));
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-auto max_element(const std::string& label, const ExecutionSpace& ex,
-                 IteratorType first, IteratorType last, ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
-      label, ex, first, last, std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto max_element(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(
-      "Kokkos::max_element_view_api_default", ex, begin(v), end(v));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto max_element(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::min_or_max_element_impl<MaxFirstLoc>(label, ex, begin(v),
-                                                    end(v));
-}
-
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
-auto max_element(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v,
-                 ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
-      "Kokkos::max_element_view_api_default", ex, begin(v), end(v),
-      std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
-auto max_element(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v,
-                 ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::min_or_max_element_impl<MaxFirstLocCustomComparator>(
-      label, ex, begin(v), end(v), std::move(comp));
-}
-
-// -------------------------
-// minmax_element public API
-// -------------------------
-template <class ExecutionSpace, class IteratorType>
-auto minmax_element(const ExecutionSpace& ex, IteratorType first,
-                    IteratorType last) {
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(
-      "Kokkos::minmax_element_iterator_api_default", ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-auto minmax_element(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last) {
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-auto minmax_element(const ExecutionSpace& ex, IteratorType first,
-                    IteratorType last, ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
-      "Kokkos::minmax_element_iterator_api_default", ex, first, last,
-      std::move(comp));
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-auto minmax_element(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last,
-                    ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
-      label, ex, first, last, std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto minmax_element(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(
-      "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto minmax_element(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::minmax_element_impl<MinMaxFirstLastLoc>(label, ex, begin(v),
-                                                       end(v));
-}
-
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
-auto minmax_element(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType, Properties...>& v,
-                    ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
-      "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v),
-      std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class ComparatorType,
-          class... Properties>
-auto minmax_element(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType, Properties...>& v,
-                    ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::minmax_element_impl<MinMaxFirstLastLocCustomComparator>(
-      label, ex, begin(v), end(v), std::move(comp));
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp
new file mode 100644
index 000000000..3418e048a
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp
@@ -0,0 +1,160 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MISMATCH_HPP
+#define KOKKOS_STD_ALGORITHMS_MISMATCH_HPP
+
+#include "impl/Kokkos_Mismatch.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// FIXME: add mismatch overloads accepting 3 iterators.
+// An overload consistent with other algorithms:
+//
+// auto mismatch(const ExecSpace& ex, It1 first1, It1 last1, It2 first2) {...}
+//
+// makes API ambiguous (with the overload accepting views).
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch(const ExecutionSpace& ex,
+                                                      IteratorType1 first1,
+                                                      IteratorType1 last1,
+                                                      IteratorType2 first2,
+                                                      IteratorType2 last2) {
+  return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex,
+                             first1, last1, first2, last2);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
+    const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
+    IteratorType2 first2, IteratorType2 last2,
+    BinaryPredicateType&& predicate) {
+  return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex,
+                             first1, last1, first2, last2,
+                             std::forward<BinaryPredicateType>(predicate));
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
+  return Impl::mismatch_impl(label, ex, first1, last1, first2, last2);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
+    BinaryPredicateType&& predicate) {
+  return Impl::mismatch_impl(label, ex, first1, last1, first2, last2,
+                             std::forward<BinaryPredicateType>(predicate));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto mismatch(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view1,
+              const ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex,
+                             KE::begin(view1), KE::end(view1), KE::begin(view2),
+                             KE::end(view2));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto mismatch(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view1,
+              const ::Kokkos::View<DataType2, Properties2...>& view2,
+              BinaryPredicateType&& predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex,
+                             KE::begin(view1), KE::end(view1), KE::begin(view2),
+                             KE::end(view2),
+                             std::forward<BinaryPredicateType>(predicate));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto mismatch(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view1,
+              const ::Kokkos::View<DataType2, Properties2...>& view2) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1),
+                             KE::begin(view2), KE::end(view2));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto mismatch(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType1, Properties1...>& view1,
+              const ::Kokkos::View<DataType2, Properties2...>& view2,
+              BinaryPredicateType&& predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1),
+                             KE::begin(view2), KE::end(view2),
+                             std::forward<BinaryPredicateType>(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
new file mode 100644
index 000000000..c2ce4655f
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MOVE_HPP
+#define KOKKOS_STD_ALGORITHMS_MOVE_HPP
+
+#include "impl/Kokkos_Move.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator move(const ExecutionSpace& ex, InputIterator first,
+                    InputIterator last, OutputIterator d_first) {
+  return Impl::move_impl("Kokkos::move_iterator_api_default", ex, first, last,
+                         d_first);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator move(const std::string& label, const ExecutionSpace& ex,
+                    InputIterator first, InputIterator last,
+                    OutputIterator d_first) {
+  return Impl::move_impl(label, ex, first, last, d_first);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto move(const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType1, Properties1...>& source,
+          ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::move_impl("Kokkos::move_view_api_default", ex, begin(source),
+                         end(source), begin(dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto move(const std::string& label, const ExecutionSpace& ex,
+          const ::Kokkos::View<DataType1, Properties1...>& source,
+          ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::move_impl(label, ex, begin(source), end(source), begin(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
new file mode 100644
index 000000000..f7462d52d
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MOVE_BACKWARD_HPP
+#define KOKKOS_STD_ALGORITHMS_MOVE_BACKWARD_HPP
+
+#include "impl/Kokkos_MoveBackward.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 move_backward(const ExecutionSpace& ex, IteratorType1 first,
+                            IteratorType1 last, IteratorType2 d_last) {
+  return Impl::move_backward_impl("Kokkos::move_backward_iterator_api_default",
+                                  ex, first, last, d_last);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto move_backward(const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& source,
+                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::move_backward_impl("Kokkos::move_backward_view_api_default", ex,
+                                  begin(source), end(source), end(dest));
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 move_backward(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType1 first, IteratorType1 last,
+                            IteratorType2 d_last) {
+  return Impl::move_backward_impl(label, ex, first, last, d_last);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto move_backward(const std::string& label, const ExecutionSpace& ex,
+                   const ::Kokkos::View<DataType1, Properties1...>& source,
+                   ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::move_backward_impl(label, ex, begin(source), end(source),
+                                  end(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp
deleted file mode 100644
index d273f092a..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp
+++ /dev/null
@@ -1,2406 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_NON_MODIFYING_SEQUENCE_OPERATIONS_HPP
-#define KOKKOS_NON_MODIFYING_SEQUENCE_OPERATIONS_HPP
-
-#include <Kokkos_Core.hpp>
-#include "Kokkos_BeginEnd.hpp"
-#include "Kokkos_Constraints.hpp"
-#include "Kokkos_ModifyingOperations.hpp"
-#include "Kokkos_HelperPredicates.hpp"
-#include "Kokkos_RandomAccessIterator.hpp"
-#include "Kokkos_Distance.hpp"
-#include <string>
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-// ------------------------------------------
-//
-// functors
-//
-// ------------------------------------------
-
-template <bool is_find_if, class IndexType, class IteratorType,
-          class ReducerType, class PredicateType>
-struct StdFindIfOrNotFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType m_first;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    const auto& my_value = m_first[i];
-
-    // if doing find_if, look for when predicate is true
-    // if doing find_if_not, look for when predicate is false
-    const bool found_condition = is_find_if ? m_p(my_value) : !m_p(my_value);
-
-    auto rv =
-        found_condition
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdFindIfOrNotFunctor(IteratorType first, ReducerType reducer,
-                        PredicateType p)
-      : m_first(std::move(first)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-template <class IteratorType, class UnaryFunctorType>
-struct StdForEachFunctor {
-  using index_type = typename IteratorType::difference_type;
-  IteratorType m_first;
-  UnaryFunctorType m_functor;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i) const { m_functor(m_first[i]); }
-
-  KOKKOS_FUNCTION
-  StdForEachFunctor(IteratorType _first, UnaryFunctorType _functor)
-      : m_first(std::move(_first)), m_functor(std::move(_functor)) {}
-};
-
-template <class IteratorType, class Predicate>
-struct StdCountIfFunctor {
-  using index_type = typename IteratorType::difference_type;
-  IteratorType m_first;
-  Predicate m_predicate;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i, index_type& lsum) const {
-    if (m_predicate(m_first[i])) {
-      lsum++;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdCountIfFunctor(IteratorType _first, Predicate _predicate)
-      : m_first(std::move(_first)), m_predicate(std::move(_predicate)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ReducerType, class BinaryPredicateType>
-struct StdMismatchRedFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType1 m_first1;
-  IteratorType2 m_first2;
-  ReducerType m_reducer;
-  BinaryPredicateType m_predicate;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    const auto& my_value1 = m_first1[i];
-    const auto& my_value2 = m_first2[i];
-
-    auto rv =
-        !m_predicate(my_value1, my_value2)
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdMismatchRedFunctor(IteratorType1 first1, IteratorType2 first2,
-                        ReducerType reducer, BinaryPredicateType predicate)
-      : m_first1(std::move(first1)),
-        m_first2(std::move(first2)),
-        m_reducer(std::move(reducer)),
-        m_predicate(std::move(predicate)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-struct StdEqualFunctor {
-  IteratorType1 m_first1;
-  IteratorType2 m_first2;
-  BinaryPredicateType m_predicate;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i, std::size_t& lsum) const {
-    if (!m_predicate(m_first1[i], m_first2[i])) {
-      lsum = 1;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdEqualFunctor(IteratorType1 _first1, IteratorType2 _first2,
-                  BinaryPredicateType _predicate)
-      : m_first1(std::move(_first1)),
-        m_first2(std::move(_first2)),
-        m_predicate(std::move(_predicate)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ReducerType, class ComparatorType>
-struct StdLexicographicalCompareFunctor {
-  using red_value_type = typename ReducerType::value_type;
-  IteratorType1 m_first1;
-  IteratorType2 m_first2;
-  ReducerType m_reducer;
-  ComparatorType m_comparator;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    const auto& my_value1 = m_first1[i];
-    const auto& my_value2 = m_first2[i];
-
-    bool different = m_comparator(my_value1, my_value2) ||
-                     m_comparator(my_value2, my_value1);
-    auto rv =
-        different
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdLexicographicalCompareFunctor(IteratorType1 _first1, IteratorType2 _first2,
-                                   ReducerType _reducer, ComparatorType _comp)
-      : m_first1(std::move(_first1)),
-        m_first2(std::move(_first2)),
-        m_reducer(std::move(_reducer)),
-        m_comparator(std::move(_comp)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ComparatorType>
-struct StdCompareFunctor {
-  IteratorType1 m_it1;
-  IteratorType2 m_it2;
-  ComparatorType m_predicate;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType /* i is unused */, int& lsum) const {
-    if (m_predicate(*m_it1, *m_it2)) {
-      lsum = 1;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdCompareFunctor(IteratorType1 _it1, IteratorType2 _it2,
-                    ComparatorType _predicate)
-      : m_it1(std::move(_it1)),
-        m_it2(std::move(_it2)),
-        m_predicate(std::move(_predicate)) {}
-};
-
-template <class IndexType, class IteratorType, class ReducerType,
-          class PredicateType>
-struct StdAdjacentFindFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType m_first;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    const auto& my_value   = m_first[i];
-    const auto& next_value = m_first[i + 1];
-    const bool are_equal   = m_p(my_value, next_value);
-
-    auto rv =
-        are_equal
-            ? red_value_type{i}
-            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdAdjacentFindFunctor(IteratorType first, ReducerType reducer,
-                         PredicateType p)
-      : m_first(std::move(first)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ReducerType, class PredicateType>
-struct StdSearchFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType1 m_first;
-  IteratorType1 m_last;
-  IteratorType2 m_s_first;
-  IteratorType2 m_s_last;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    namespace KE = ::Kokkos::Experimental;
-    auto myit    = m_first + i;
-    bool found   = true;
-
-    const auto search_count = KE::distance(m_s_first, m_s_last);
-    for (IndexType k = 0; k < search_count; ++k) {
-      // note that we add this EXPECT to check if we are in a valid range
-      // but I think we can remove this beceause the guarantee we don't go
-      // out of bounds is taken care of at the calling site
-      // where we launch the par-reduce.
-      KOKKOS_EXPECTS((myit + k) < m_last);
-
-      if (!m_p(myit[k], m_s_first[k])) {
-        found = false;
-        break;
-      }
-    }
-
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdSearchFunctor(IteratorType1 first, IteratorType1 last,
-                   IteratorType2 s_first, IteratorType2 s_last,
-                   ReducerType reducer, PredicateType p)
-      : m_first(std::move(first)),
-        m_last(std::move(last)),
-        m_s_first(std::move(s_first)),
-        m_s_last(std::move(s_last)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-template <class IndexType, class IteratorType, class SizeType, class ValueType,
-          class ReducerType, class PredicateType>
-struct StdSearchNFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType m_first;
-  IteratorType m_last;
-  SizeType m_count;
-  ValueType m_value;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    namespace KE = ::Kokkos::Experimental;
-    auto myit    = m_first + i;
-    bool found   = true;
-
-    for (SizeType k = 0; k < m_count; ++k) {
-      // note that we add this EXPECT to check if we are in a valid range
-      // but I think we can remove this beceause the guarantee we don't go
-      // out of bounds is taken care of at the calling site
-      // where we launch the par-reduce.
-      KOKKOS_EXPECTS((myit + k) < m_last);
-
-      if (!m_p(myit[k], m_value)) {
-        found = false;
-        break;
-      }
-    }
-
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdSearchNFunctor(IteratorType first, IteratorType last, SizeType count,
-                    ValueType value, ReducerType reducer, PredicateType p)
-      : m_first(std::move(first)),
-        m_last(std::move(last)),
-        m_count(std::move(count)),
-        m_value(std::move(value)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ReducerType, class PredicateType>
-struct StdFindFirstOfFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType1 m_first;
-  IteratorType2 m_s_first;
-  IteratorType2 m_s_last;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    namespace KE        = ::Kokkos::Experimental;
-    const auto& myvalue = m_first[i];
-    bool found          = false;
-
-    const auto search_count = KE::distance(m_s_first, m_s_last);
-    for (IndexType k = 0; k < search_count; ++k) {
-      if (m_p(myvalue, m_s_first[k])) {
-        found = true;
-        break;
-      }
-    }
-
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdFindFirstOfFunctor(IteratorType1 first, IteratorType2 s_first,
-                        IteratorType2 s_last, ReducerType reducer,
-                        PredicateType p)
-      : m_first(std::move(first)),
-        m_s_first(std::move(s_first)),
-        m_s_last(std::move(s_last)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ReducerType, class PredicateType>
-struct StdFindEndFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType1 m_first;
-  IteratorType1 m_last;
-  IteratorType2 m_s_first;
-  IteratorType2 m_s_last;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    namespace KE = ::Kokkos::Experimental;
-    auto myit    = m_first + i;
-    bool found   = true;
-
-    const auto search_count = KE::distance(m_s_first, m_s_last);
-    for (IndexType k = 0; k < search_count; ++k) {
-      // note that we add this EXPECT to check if we are in a valid range
-      // but I think we can remvoe this beceause the guarantee we don't go
-      // out of bounds is taken care of at the calling site
-      // where we launch the par-reduce.
-      KOKKOS_EXPECTS((myit + k) < m_last);
-
-      if (!m_p(myit[k], m_s_first[k])) {
-        found = false;
-        break;
-      }
-    }
-
-    const auto rv =
-        found ? red_value_type{i}
-              : red_value_type{::Kokkos::reduction_identity<IndexType>::max()};
-
-    m_reducer.join(red_value, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdFindEndFunctor(IteratorType1 first, IteratorType1 last,
-                    IteratorType2 s_first, IteratorType2 s_last,
-                    ReducerType reducer, PredicateType p)
-      : m_first(std::move(first)),
-        m_last(std::move(last)),
-        m_s_first(std::move(s_first)),
-        m_s_last(std::move(s_last)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-// ------------------------------------------
-// find_if_or_not_impl
-// ------------------------------------------
-template <bool is_find_if, class ExecutionSpace, class IteratorType,
-          class PredicateType>
-IteratorType find_if_or_not_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType first,
-                                 IteratorType last, PredicateType pred) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(
-      ex, first);  // only need one It per type
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    return last;
-  }
-
-  // aliases
-  using index_type           = typename IteratorType::difference_type;
-  using reducer_type         = FirstLoc<index_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using func_t = StdFindIfOrNotFunctor<is_find_if, index_type, IteratorType,
-                                       reducer_type, PredicateType>;
-
-  // run
-  reduction_value_type red_result;
-  reducer_type reducer(red_result);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first, reducer, pred), reducer);
-
-  // fence not needed because reducing into scalar
-
-  // decide and return
-  if (red_result.min_loc_true ==
-      ::Kokkos::reduction_identity<index_type>::min()) {
-    // here, it means a valid loc has not been found,
-    return last;
-  } else {
-    // a location has been found
-    return first + red_result.min_loc_true;
-  }
-}
-
-// ------------------------------------------
-// find_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class T>
-InputIterator find_impl(const std::string& label, ExecutionSpace ex,
-                        InputIterator first, InputIterator last,
-                        const T& value) {
-  return find_if_or_not_impl<true>(
-      label, ex, first, last,
-      ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
-}
-
-// ------------------------------------------
-// for_each_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
-UnaryFunctorType for_each_impl(const std::string& label,
-                               const ExecutionSpace& ex, IteratorType first,
-                               IteratorType last, UnaryFunctorType functor) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      StdForEachFunctor<IteratorType, UnaryFunctorType>(first, functor));
-  ex.fence("Kokkos::for_each: fence after operation");
-
-  return functor;
-}
-
-// ------------------------------------------
-// for_each_n_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class UnaryFunctorType>
-IteratorType for_each_n_impl(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, SizeType n,
-                             UnaryFunctorType functor) {
-  auto last = first + n;
-  Impl::static_assert_random_access_and_accessible(ex, first, last);
-  Impl::expect_valid_range(first, last);
-
-  if (n == 0) {
-    return first;
-  }
-
-  for_each_impl(label, ex, first, last, std::move(functor));
-  // no neeed to fence since for_each_impl fences already
-
-  return last;
-}
-
-// ------------------------------------------
-// count_if_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class Predicate>
-typename IteratorType::difference_type count_if_impl(const std::string& label,
-                                                     const ExecutionSpace& ex,
-                                                     IteratorType first,
-                                                     IteratorType last,
-                                                     Predicate predicate) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using func_t = StdCountIfFunctor<IteratorType, Predicate>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  typename IteratorType::difference_type count = 0;
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first, predicate), count);
-  ex.fence("Kokkos::count_if: fence after operation");
-
-  return count;
-}
-
-// ------------------------------------------
-// count_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class T>
-auto count_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType first, IteratorType last, const T& value) {
-  return count_if_impl(
-      label, ex, first, last,
-      ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
-}
-
-// ------------------------------------------
-// mismatch_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch_impl(
-    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
-    BinaryPredicateType predicate) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
-  Impl::expect_valid_range(first1, last1);
-  Impl::expect_valid_range(first2, last2);
-
-  // aliases
-  using return_type          = ::Kokkos::pair<IteratorType1, IteratorType2>;
-  using index_type           = typename IteratorType1::difference_type;
-  using reducer_type         = FirstLoc<index_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using functor_type =
-      StdMismatchRedFunctor<index_type, IteratorType1, IteratorType2,
-                            reducer_type, BinaryPredicateType>;
-
-  // trivial case: note that this is important,
-  // for OpenMPTarget, omitting special handling of
-  // the trivial case was giving all sorts of strange stuff.
-  const auto num_e1 = last1 - first1;
-  const auto num_e2 = last2 - first2;
-  if (num_e1 == 0 || num_e2 == 0) {
-    return return_type(first1, first2);
-  }
-
-  // run
-  const auto num_elemen_par_reduce = (num_e1 <= num_e2) ? num_e1 : num_e2;
-  reduction_value_type red_result;
-  reducer_type reducer(red_result);
-  ::Kokkos::parallel_reduce(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elemen_par_reduce),
-      functor_type(first1, first2, reducer, std::move(predicate)), reducer);
-
-  // fence not needed because reducing into scalar
-
-  // decide and return
-  constexpr auto red_min = ::Kokkos::reduction_identity<index_type>::min();
-  if (red_result.min_loc_true == red_min) {
-    // in here means mismatch has not been found
-    if (num_e1 == num_e2) {
-      return return_type(last1, last2);
-    } else if (num_e1 < num_e2) {
-      return return_type(last1, first2 + num_e1);
-    } else {
-      return return_type(first1 + num_e2, last2);
-    }
-  } else {
-    // in here means mismatch has been found
-    return return_type(first1 + red_result.min_loc_true,
-                       first2 + red_result.min_loc_true);
-  }
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch_impl(
-    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
-  using value_type1 = typename IteratorType1::value_type;
-  using value_type2 = typename IteratorType2::value_type;
-  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return mismatch_impl(label, ex, first1, last1, first2, last2, pred_t());
-}
-
-// ------------------------------------------
-// all_of_impl, any_of_impl, none_of_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class Predicate>
-bool all_of_impl(const std::string& label, const ExecutionSpace& ex,
-                 InputIterator first, InputIterator last, Predicate predicate) {
-  return (find_if_or_not_impl<false>(label, ex, first, last, predicate) ==
-          last);
-}
-
-template <class ExecutionSpace, class InputIterator, class Predicate>
-bool any_of_impl(const std::string& label, const ExecutionSpace& ex,
-                 InputIterator first, InputIterator last, Predicate predicate) {
-  return (find_if_or_not_impl<true>(label, ex, first, last, predicate) != last);
-}
-
-template <class ExecutionSpace, class IteratorType, class Predicate>
-bool none_of_impl(const std::string& label, const ExecutionSpace& ex,
-                  IteratorType first, IteratorType last, Predicate predicate) {
-  return (find_if_or_not_impl<true>(label, ex, first, last, predicate) == last);
-}
-
-// ------------------------------------------
-// equal_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
-                BinaryPredicateType predicate) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
-  Impl::expect_valid_range(first1, last1);
-
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t     = StdEqualFunctor<index_type, IteratorType1, IteratorType2,
-                                 BinaryPredicateType>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
-  std::size_t different   = 0;
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first1, first2, predicate), different);
-  ex.fence("Kokkos::equal: fence after operation");
-
-  return !different;
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1,
-                IteratorType2 first2) {
-  using value_type1 = typename IteratorType1::value_type;
-  using value_type2 = typename IteratorType2::value_type;
-  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return equal_impl(label, ex, first1, last1, first2, pred_t());
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
-                IteratorType2 last2, BinaryPredicateType predicate) {
-  const auto d1 = ::Kokkos::Experimental::distance(first1, last1);
-  const auto d2 = ::Kokkos::Experimental::distance(first2, last2);
-  if (d1 != d2) {
-    return false;
-  }
-
-  return equal_impl(label, ex, first1, last1, first2, predicate);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool equal_impl(const std::string& label, const ExecutionSpace& ex,
-                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
-                IteratorType2 last2) {
-  Impl::expect_valid_range(first1, last1);
-  Impl::expect_valid_range(first2, last2);
-
-  using value_type1 = typename IteratorType1::value_type;
-  using value_type2 = typename IteratorType2::value_type;
-  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return equal_impl(label, ex, first1, last1, first2, last2, pred_t());
-}
-
-// ------------------------------------------
-// lexicographical_compare_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ComparatorType>
-bool lexicographical_compare_impl(const std::string& label,
-                                  const ExecutionSpace& ex,
-                                  IteratorType1 first1, IteratorType1 last1,
-                                  IteratorType2 first2, IteratorType2 last2,
-                                  ComparatorType comp) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
-  Impl::expect_valid_range(first1, last1);
-  Impl::expect_valid_range(first2, last2);
-
-  // aliases
-  using index_type           = typename IteratorType1::difference_type;
-  using reducer_type         = FirstLoc<index_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-
-  // run
-  const auto d1    = Kokkos::Experimental::distance(first1, last1);
-  const auto d2    = Kokkos::Experimental::distance(first2, last2);
-  const auto range = Kokkos::Experimental::min(d1, d2);
-  reduction_value_type red_result;
-  reducer_type reducer(red_result);
-  using func1_t =
-      StdLexicographicalCompareFunctor<index_type, IteratorType1, IteratorType2,
-                                       reducer_type, ComparatorType>;
-
-  ::Kokkos::parallel_reduce(label, RangePolicy<ExecutionSpace>(ex, 0, range),
-                            func1_t(first1, first2, reducer, comp), reducer);
-
-  // fence not needed because reducing into scalar
-  // no mismatch
-  if (red_result.min_loc_true ==
-      ::Kokkos::reduction_identity<index_type>::min()) {
-    auto new_last1 = first1 + range;
-    auto new_last2 = first2 + range;
-    bool is_prefix = (new_last1 == last1) && (new_last2 != last2);
-    return is_prefix;
-  }
-
-  // check mismatched
-  int less      = 0;
-  auto it1      = first1 + red_result.min_loc_true;
-  auto it2      = first2 + red_result.min_loc_true;
-  using func2_t = StdCompareFunctor<index_type, IteratorType1, IteratorType2,
-                                    ComparatorType>;
-  ::Kokkos::parallel_reduce(label, RangePolicy<ExecutionSpace>(ex, 0, 1),
-                            func2_t(it1, it2, comp), less);
-
-  // fence not needed because reducing into scalar
-  return static_cast<bool>(less);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool lexicographical_compare_impl(const std::string& label,
-                                  const ExecutionSpace& ex,
-                                  IteratorType1 first1, IteratorType1 last1,
-                                  IteratorType2 first2, IteratorType2 last2) {
-  using value_type_1 = typename IteratorType1::value_type;
-  using value_type_2 = typename IteratorType2::value_type;
-  using predicate_t =
-      Impl::StdAlgoLessThanBinaryPredicate<value_type_1, value_type_2>;
-  return lexicographical_compare_impl(label, ex, first1, last1, first2, last2,
-                                      predicate_t());
-}
-
-// ------------------------------------------
-// adjacent_find_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType adjacent_find_impl(const std::string& label,
-                                const ExecutionSpace& ex, IteratorType first,
-                                IteratorType last, PredicateType pred) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-
-  if (num_elements <= 1) {
-    return last;
-  }
-
-  using index_type           = typename IteratorType::difference_type;
-  using reducer_type         = FirstLoc<index_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using func_t = StdAdjacentFindFunctor<index_type, IteratorType, reducer_type,
-                                        PredicateType>;
-
-  reduction_value_type red_result;
-  reducer_type reducer(red_result);
-
-  // note that we use below num_elements-1 because
-  // each index i in the reduction checks i and (i+1).
-  ::Kokkos::parallel_reduce(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements - 1),
-      func_t(first, reducer, pred), reducer);
-
-  // fence not needed because reducing into scalar
-  if (red_result.min_loc_true ==
-      ::Kokkos::reduction_identity<index_type>::min()) {
-    return last;
-  } else {
-    return first + red_result.min_loc_true;
-  }
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType adjacent_find_impl(const std::string& label,
-                                const ExecutionSpace& ex, IteratorType first,
-                                IteratorType last) {
-  using value_type     = typename IteratorType::value_type;
-  using default_pred_t = StdAlgoEqualBinaryPredicate<value_type>;
-  return adjacent_find_impl(label, ex, first, last, default_pred_t());
-}
-
-// ------------------------------------------
-// search_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
-                          IteratorType1 first, IteratorType1 last,
-                          IteratorType2 s_first, IteratorType2 s_last,
-                          const BinaryPredicateType& pred) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, s_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
-  Impl::expect_valid_range(first, last);
-  Impl::expect_valid_range(s_first, s_last);
-
-  // the target sequence should not be larger than the range [first, last)
-  namespace KE            = ::Kokkos::Experimental;
-  const auto num_elements = KE::distance(first, last);
-  const auto s_count      = KE::distance(s_first, s_last);
-  KOKKOS_EXPECTS(num_elements >= s_count);
-  (void)s_count;  // needed when macro above is a no-op
-
-  if (s_first == s_last) {
-    return first;
-  }
-
-  if (first == last) {
-    return last;
-  }
-
-  // special case where the two ranges have equal size
-  if (num_elements == s_count) {
-    const auto equal_result = equal_impl(label, ex, first, last, s_first, pred);
-    return (equal_result) ? first : last;
-  } else {
-    using index_type           = typename IteratorType1::difference_type;
-    using reducer_type         = FirstLoc<index_type>;
-    using reduction_value_type = typename reducer_type::value_type;
-    using func_t = StdSearchFunctor<index_type, IteratorType1, IteratorType2,
-                                    reducer_type, BinaryPredicateType>;
-
-    // run
-    reduction_value_type red_result;
-    reducer_type reducer(red_result);
-
-    // decide the size of the range policy of the par_red:
-    // note that the last feasible index to start looking is the index
-    // whose distance from the "last" is equal to the sequence count.
-    // the +1 is because we need to include that location too.
-    const auto range_size = num_elements - s_count + 1;
-
-    // run par reduce
-    ::Kokkos::parallel_reduce(
-        label, RangePolicy<ExecutionSpace>(ex, 0, range_size),
-        func_t(first, last, s_first, s_last, reducer, pred), reducer);
-
-    // fence not needed because reducing into scalar
-
-    // decide and return
-    if (red_result.min_loc_true ==
-        ::Kokkos::reduction_identity<index_type>::min()) {
-      // location has not been found
-      return last;
-    } else {
-      // location has been found
-      return first + red_result.min_loc_true;
-    }
-  }
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
-                          IteratorType1 first, IteratorType1 last,
-                          IteratorType2 s_first, IteratorType2 s_last) {
-  using value_type1    = typename IteratorType1::value_type;
-  using value_type2    = typename IteratorType2::value_type;
-  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return search_impl(label, ex, first, last, s_first, s_last, predicate_type());
-}
-
-// ------------------------------------------
-// search_n_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType, class BinaryPredicateType>
-IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
-                           IteratorType first, IteratorType last,
-                           SizeType count, const ValueType& value,
-                           const BinaryPredicateType& pred) {
-  // checks
-  static_assert_random_access_and_accessible(ex, first);
-  expect_valid_range(first, last);
-  KOKKOS_EXPECTS((std::ptrdiff_t)count >= 0);
-
-  // count should not be larger than the range [first, last)
-  namespace KE            = ::Kokkos::Experimental;
-  const auto num_elements = KE::distance(first, last);
-  // cast things to avoid compiler warning
-  KOKKOS_EXPECTS((std::size_t)num_elements >= (std::size_t)count);
-
-  if (first == last) {
-    return first;
-  }
-
-  // special case where num elements in [first, last) == count
-  if ((std::size_t)num_elements == (std::size_t)count) {
-    using equal_to_value = StdAlgoEqualsValUnaryPredicate<ValueType>;
-    const auto satisfies =
-        all_of_impl(label, ex, first, last, equal_to_value(value));
-    return (satisfies) ? first : last;
-  } else {
-    // aliases
-    using index_type           = typename IteratorType::difference_type;
-    using reducer_type         = FirstLoc<index_type>;
-    using reduction_value_type = typename reducer_type::value_type;
-    using func_t =
-        StdSearchNFunctor<index_type, IteratorType, SizeType, ValueType,
-                          reducer_type, BinaryPredicateType>;
-
-    // run
-    reduction_value_type red_result;
-    reducer_type reducer(red_result);
-
-    // decide the size of the range policy of the par_red:
-    // the last feasible index to start looking is the index
-    // whose distance from the "last" is equal to count.
-    // the +1 is because we need to include that location too.
-    const auto range_size = num_elements - count + 1;
-
-    // run par reduce
-    ::Kokkos::parallel_reduce(
-        label, RangePolicy<ExecutionSpace>(ex, 0, range_size),
-        func_t(first, last, count, value, reducer, pred), reducer);
-
-    // fence not needed because reducing into scalar
-
-    // decide and return
-    if (red_result.min_loc_true ==
-        ::Kokkos::reduction_identity<index_type>::min()) {
-      // location has not been found
-      return last;
-    } else {
-      // location has been found
-      return first + red_result.min_loc_true;
-    }
-  }
-}
-
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType>
-IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
-                           IteratorType first, IteratorType last,
-                           SizeType count, const ValueType& value) {
-  using iter_value_type = typename IteratorType::value_type;
-  using predicate_type =
-      StdAlgoEqualBinaryPredicate<iter_value_type, ValueType>;
-
-  /* above we use <iter_value_type, ValueType> for the predicate_type
-     to be consistent with the standard, which says:
-
-     "
-     The signature of the predicate function should be equivalent to:
-
-        bool pred(const Type1 &a, const Type2 &b);
-
-     The type Type1 must be such that an object of type ForwardIt can be
-     dereferenced and then implicitly converted to Type1. The type Type2 must be
-     such that an object of type T can be implicitly converted to Type2.
-     "
-
-     In our case, IteratorType = ForwardIt, and ValueType = T.
-   */
-
-  return search_n_impl(label, ex, first, last, count, value, predicate_type());
-}
-
-// ------------------------------------------
-// find_first_of_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 find_first_of_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 s_first,
-                                 IteratorType2 s_last,
-                                 const BinaryPredicateType& pred) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, s_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
-  Impl::expect_valid_range(first, last);
-  Impl::expect_valid_range(s_first, s_last);
-
-  if ((s_first == s_last) || (first == last)) {
-    return last;
-  }
-
-  using index_type           = typename IteratorType1::difference_type;
-  using reducer_type         = FirstLoc<index_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using func_t = StdFindFirstOfFunctor<index_type, IteratorType1, IteratorType2,
-                                       reducer_type, BinaryPredicateType>;
-
-  // run
-  reduction_value_type red_result;
-  reducer_type reducer(red_result);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_t(first, s_first, s_last, reducer, pred), reducer);
-
-  // fence not needed because reducing into scalar
-
-  // decide and return
-  if (red_result.min_loc_true ==
-      ::Kokkos::reduction_identity<index_type>::min()) {
-    // if here, nothing found
-    return last;
-  } else {
-    // a location has been found
-    return first + red_result.min_loc_true;
-  }
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_first_of_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 s_first,
-                                 IteratorType2 s_last) {
-  using value_type1    = typename IteratorType1::value_type;
-  using value_type2    = typename IteratorType2::value_type;
-  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return find_first_of_impl(label, ex, first, last, s_first, s_last,
-                            predicate_type());
-}
-
-// ------------------------------------------
-// find_end_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 s_first, IteratorType2 s_last,
-                            const BinaryPredicateType& pred) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, s_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
-  Impl::expect_valid_range(first, last);
-  Impl::expect_valid_range(s_first, s_last);
-
-  // the target sequence should not be larger than the range [first, last)
-  namespace KE            = ::Kokkos::Experimental;
-  const auto num_elements = KE::distance(first, last);
-  const auto s_count      = KE::distance(s_first, s_last);
-  KOKKOS_EXPECTS(num_elements >= s_count);
-  (void)s_count;  // needed when macro above is a no-op
-
-  if (s_first == s_last) {
-    return last;
-  }
-
-  if (first == last) {
-    return last;
-  }
-
-  // special case where the two ranges have equal size
-  if (num_elements == s_count) {
-    const auto equal_result = equal_impl(label, ex, first, last, s_first, pred);
-    return (equal_result) ? first : last;
-  } else {
-    using index_type           = typename IteratorType1::difference_type;
-    using reducer_type         = LastLoc<index_type>;
-    using reduction_value_type = typename reducer_type::value_type;
-    using func_t = StdFindEndFunctor<index_type, IteratorType1, IteratorType2,
-                                     reducer_type, BinaryPredicateType>;
-
-    // run
-    reduction_value_type red_result;
-    reducer_type reducer(red_result);
-
-    // decide the size of the range policy of the par_red:
-    // note that the last feasible index to start looking is the index
-    // whose distance from the "last" is equal to the sequence count.
-    // the +1 is because we need to include that location too.
-    const auto range_size = num_elements - s_count + 1;
-
-    // run par reduce
-    ::Kokkos::parallel_reduce(
-        label, RangePolicy<ExecutionSpace>(ex, 0, range_size),
-        func_t(first, last, s_first, s_last, reducer, pred), reducer);
-
-    // fence not needed because reducing into scalar
-
-    // decide and return
-    if (red_result.max_loc_true ==
-        ::Kokkos::reduction_identity<index_type>::max()) {
-      // if here, a subrange has not been found
-      return last;
-    } else {
-      // a location has been found
-      return first + red_result.max_loc_true;
-    }
-  }
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 s_first, IteratorType2 s_last) {
-  using value_type1    = typename IteratorType1::value_type;
-  using value_type2    = typename IteratorType2::value_type;
-  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-  return find_end_impl(label, ex, first, last, s_first, s_last,
-                       predicate_type());
-}
-
-}  // namespace Impl
-
-// ----------------------------------
-// find public API
-// ----------------------------------
-template <class ExecutionSpace, class InputIterator, class T>
-InputIterator find(const ExecutionSpace& ex, InputIterator first,
-                   InputIterator last, const T& value) {
-  return Impl::find_impl("Kokkos::find_iterator_api_default", ex, first, last,
-                         value);
-}
-
-template <class ExecutionSpace, class InputIterator, class T>
-InputIterator find(const std::string& label, const ExecutionSpace& ex,
-                   InputIterator first, InputIterator last, const T& value) {
-  return Impl::find_impl(label, ex, first, last, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class T>
-auto find(const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_impl("Kokkos::find_view_api_default", ex, KE::begin(view),
-                         KE::end(view), value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class T>
-auto find(const std::string& label, const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_impl(label, ex, KE::begin(view), KE::end(view), value);
-}
-
-// -------------------
-// find_if public API
-// -------------------
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType find_if(const ExecutionSpace& ex, IteratorType first,
-                     IteratorType last, PredicateType predicate) {
-  return Impl::find_if_or_not_impl<true>("Kokkos::find_if_iterator_api_default",
-                                         ex, first, last, std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType find_if(const std::string& label, const ExecutionSpace& ex,
-                     IteratorType first, IteratorType last,
-                     PredicateType predicate) {
-  return Impl::find_if_or_not_impl<true>(label, ex, first, last,
-                                         std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-auto find_if(const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType, Properties...>& v,
-             Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<true>("Kokkos::find_if_view_api_default", ex,
-                                         KE::begin(v), KE::end(v),
-                                         std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-auto find_if(const std::string& label, const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType, Properties...>& v,
-             Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<true>(label, ex, KE::begin(v), KE::end(v),
-                                         std::move(predicate));
-}
-
-// ----------------------------------
-// find_if_not public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType, class Predicate>
-IteratorType find_if_not(const ExecutionSpace& ex, IteratorType first,
-                         IteratorType last, Predicate predicate) {
-  return Impl::find_if_or_not_impl<false>(
-      "Kokkos::find_if_not_iterator_api_default", ex, first, last,
-      std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType, class Predicate>
-IteratorType find_if_not(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last,
-                         Predicate predicate) {
-  return Impl::find_if_or_not_impl<false>(label, ex, first, last,
-                                          std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-auto find_if_not(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v,
-                 Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<false>(
-      "Kokkos::find_if_not_view_api_default", ex, KE::begin(v), KE::end(v),
-      std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-auto find_if_not(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& v,
-                 Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_if_or_not_impl<false>(label, ex, KE::begin(v), KE::end(v),
-                                          std::move(predicate));
-}
-
-// ----------------------------------
-// for_each public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
-UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex,
-                          IteratorType first, IteratorType last,
-                          UnaryFunctorType functor) {
-  return Impl::for_each_impl(label, ex, first, last, std::move(functor));
-}
-
-template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
-UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first,
-                          IteratorType last, UnaryFunctorType functor) {
-  return Impl::for_each_impl("Kokkos::for_each_iterator_api_default", ex, first,
-                             last, std::move(functor));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryFunctorType>
-UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex,
-                          const ::Kokkos::View<DataType, Properties...>& v,
-                          UnaryFunctorType functor) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_impl(label, ex, KE::begin(v), KE::end(v),
-                             std::move(functor));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryFunctorType>
-UnaryFunctorType for_each(const ExecutionSpace& ex,
-                          const ::Kokkos::View<DataType, Properties...>& v,
-                          UnaryFunctorType functor) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_impl("Kokkos::for_each_view_api_default", ex,
-                             KE::begin(v), KE::end(v), std::move(functor));
-}
-
-// ----------------------------------
-// for_each_n public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class UnaryFunctorType>
-IteratorType for_each_n(const std::string& label, const ExecutionSpace& ex,
-                        IteratorType first, SizeType n,
-                        UnaryFunctorType functor) {
-  return Impl::for_each_n_impl(label, ex, first, n, std::move(functor));
-}
-
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class UnaryFunctorType>
-IteratorType for_each_n(const ExecutionSpace& ex, IteratorType first,
-                        SizeType n, UnaryFunctorType functor) {
-  return Impl::for_each_n_impl("Kokkos::for_each_n_iterator_api_default", ex,
-                               first, n, std::move(functor));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class UnaryFunctorType>
-auto for_each_n(const std::string& label, const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType, Properties...>& v, SizeType n,
-                UnaryFunctorType functor) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_n_impl(label, ex, KE::begin(v), n, std::move(functor));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class UnaryFunctorType>
-auto for_each_n(const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType, Properties...>& v, SizeType n,
-                UnaryFunctorType functor) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::for_each_n_impl("Kokkos::for_each_n_view_api_default", ex,
-                               KE::begin(v), n, std::move(functor));
-}
-
-// ----------------------------------
-// count_if public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType, class Predicate>
-typename IteratorType::difference_type count_if(const ExecutionSpace& ex,
-                                                IteratorType first,
-                                                IteratorType last,
-                                                Predicate predicate) {
-  return Impl::count_if_impl("Kokkos::count_if_iterator_api_default", ex, first,
-                             last, std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType, class Predicate>
-typename IteratorType::difference_type count_if(const std::string& label,
-                                                const ExecutionSpace& ex,
-                                                IteratorType first,
-                                                IteratorType last,
-                                                Predicate predicate) {
-  return Impl::count_if_impl(label, ex, first, last, std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-auto count_if(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& v,
-              Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::count_if_impl("Kokkos::count_if_view_api_default", ex,
-                             KE::cbegin(v), KE::cend(v), std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-auto count_if(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& v,
-              Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::count_if_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                             std::move(predicate));
-}
-
-// ----------------------------------
-// count public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType, class T>
-typename IteratorType::difference_type count(const ExecutionSpace& ex,
-                                             IteratorType first,
-                                             IteratorType last,
-                                             const T& value) {
-  return Impl::count_impl("Kokkos::count_iterator_api_default", ex, first, last,
-                          value);
-}
-
-template <class ExecutionSpace, class IteratorType, class T>
-typename IteratorType::difference_type count(const std::string& label,
-                                             const ExecutionSpace& ex,
-                                             IteratorType first,
-                                             IteratorType last,
-                                             const T& value) {
-  return Impl::count_impl(label, ex, first, last, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class T>
-auto count(const ExecutionSpace& ex,
-           const ::Kokkos::View<DataType, Properties...>& v, const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::count_impl("Kokkos::count_view_api_default", ex, KE::cbegin(v),
-                          KE::cend(v), value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class T>
-auto count(const std::string& label, const ExecutionSpace& ex,
-           const ::Kokkos::View<DataType, Properties...>& v, const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::count_impl(label, ex, KE::cbegin(v), KE::cend(v), value);
-}
-
-// ----------------------------------
-// mismatch public API
-// ----------------------------------
-// FIXME: add mismatch overloads accepting 3 iterators.
-// An overload consistent with other algorithms:
-//
-// auto mismatch(const ExecSpace& ex, It1 first1, It1 last1, It2 first2) {...}
-//
-// makes API ambiguous (with the overload accepting views).
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch(const ExecutionSpace& ex,
-                                                      IteratorType1 first1,
-                                                      IteratorType1 last1,
-                                                      IteratorType2 first2,
-                                                      IteratorType2 last2) {
-  return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex,
-                             first1, last1, first2, last2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
-    const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-    IteratorType2 first2, IteratorType2 last2,
-    BinaryPredicateType&& predicate) {
-  return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex,
-                             first1, last1, first2, last2,
-                             std::forward<BinaryPredicateType>(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
-    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
-  return Impl::mismatch_impl(label, ex, first1, last1, first2, last2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-::Kokkos::pair<IteratorType1, IteratorType2> mismatch(
-    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
-    BinaryPredicateType&& predicate) {
-  return Impl::mismatch_impl(label, ex, first1, last1, first2, last2,
-                             std::forward<BinaryPredicateType>(predicate));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto mismatch(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view1,
-              const ::Kokkos::View<DataType2, Properties2...>& view2) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex,
-                             KE::begin(view1), KE::end(view1), KE::begin(view2),
-                             KE::end(view2));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto mismatch(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view1,
-              const ::Kokkos::View<DataType2, Properties2...>& view2,
-              BinaryPredicateType&& predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex,
-                             KE::begin(view1), KE::end(view1), KE::begin(view2),
-                             KE::end(view2),
-                             std::forward<BinaryPredicateType>(predicate));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto mismatch(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view1,
-              const ::Kokkos::View<DataType2, Properties2...>& view2) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1),
-                             KE::begin(view2), KE::end(view2));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto mismatch(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view1,
-              const ::Kokkos::View<DataType2, Properties2...>& view2,
-              BinaryPredicateType&& predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1),
-                             KE::begin(view2), KE::end(view2),
-                             std::forward<BinaryPredicateType>(predicate));
-}
-
-// ----------------------------------
-// all_of public API
-// ----------------------------------
-template <class ExecutionSpace, class InputIterator, class Predicate>
-bool all_of(const ExecutionSpace& ex, InputIterator first, InputIterator last,
-            Predicate predicate) {
-  return Impl::all_of_impl("Kokkos::all_of_iterator_api_default", ex, first,
-                           last, predicate);
-}
-
-template <class ExecutionSpace, class InputIterator, class Predicate>
-bool all_of(const std::string& label, const ExecutionSpace& ex,
-            InputIterator first, InputIterator last, Predicate predicate) {
-  return Impl::all_of_impl(label, ex, first, last, predicate);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-bool all_of(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& v,
-            Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::all_of_impl("Kokkos::all_of_view_api_default", ex, KE::cbegin(v),
-                           KE::cend(v), std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-bool all_of(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& v,
-            Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::all_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                           std::move(predicate));
-}
-
-// ----------------------------------
-// any_of public API
-// ----------------------------------
-template <class ExecutionSpace, class InputIterator, class Predicate>
-bool any_of(const ExecutionSpace& ex, InputIterator first, InputIterator last,
-            Predicate predicate) {
-  return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, first, last,
-                           predicate);
-}
-
-template <class ExecutionSpace, class InputIterator, class Predicate>
-bool any_of(const std::string& label, const ExecutionSpace& ex,
-            InputIterator first, InputIterator last, Predicate predicate) {
-  return Impl::any_of_impl(label, ex, first, last, predicate);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-bool any_of(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& v,
-            Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, KE::cbegin(v),
-                           KE::cend(v), std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-bool any_of(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& v,
-            Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::any_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                           std::move(predicate));
-}
-
-// ----------------------------------
-// none_of public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType, class Predicate>
-bool none_of(const ExecutionSpace& ex, IteratorType first, IteratorType last,
-             Predicate predicate) {
-  return Impl::none_of_impl("Kokkos::none_of_iterator_api_default", ex, first,
-                            last, predicate);
-}
-
-template <class ExecutionSpace, class IteratorType, class Predicate>
-bool none_of(const std::string& label, const ExecutionSpace& ex,
-             IteratorType first, IteratorType last, Predicate predicate) {
-  return Impl::none_of_impl(label, ex, first, last, predicate);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-bool none_of(const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType, Properties...>& v,
-             Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::none_of_impl("Kokkos::none_of_view_api_default", ex,
-                            KE::cbegin(v), KE::cend(v), std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Predicate>
-bool none_of(const std::string& label, const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType, Properties...>& v,
-             Predicate predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::none_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
-                            std::move(predicate));
-}
-
-// ----------------------------------
-// equal public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2) {
-  return Impl::equal_impl(label, ex, first1, last1, first2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2, BinaryPredicateType predicate) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2, std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2,
-      BinaryPredicateType predicate) {
-  return Impl::equal_impl(label, ex, first1, last1, first2,
-                          std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-bool equal(const ExecutionSpace& ex,
-           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl("Kokkos::equal_view_api_default", ex,
-                          KE::cbegin(view1), KE::cend(view1),
-                          KE::cbegin(view2));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-bool equal(const std::string& label, const ExecutionSpace& ex,
-           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1),
-                          KE::cbegin(view2));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-bool equal(const ExecutionSpace& ex,
-           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2,
-           BinaryPredicateType predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl("Kokkos::equal_view_api_default", ex,
-                          KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2),
-                          std::move(predicate));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-bool equal(const std::string& label, const ExecutionSpace& ex,
-           const ::Kokkos::View<DataType1, Properties1...>& view1,
-           ::Kokkos::View<DataType2, Properties2...>& view2,
-           BinaryPredicateType predicate) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1),
-                          KE::cbegin(view2), std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2, IteratorType2 last2) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2, last2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
-  return Impl::equal_impl(label, ex, first1, last1, first2, last2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1,
-      IteratorType2 first2, IteratorType2 last2,
-      BinaryPredicateType predicate) {
-  return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1,
-                          last1, first2, last2, std::move(predicate));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      IteratorType1, IteratorType2>::value,
-                  bool>
-equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-      IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
-      BinaryPredicateType predicate) {
-  return Impl::equal_impl(label, ex, first1, last1, first2, last2,
-                          std::move(predicate));
-}
-
-// ----------------------------------
-// lexicographical_compare public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1,
-                             IteratorType1 last1, IteratorType2 first2,
-                             IteratorType2 last2) {
-  return Impl::lexicographical_compare_impl(
-      "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1,
-      first2, last2);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType1 first1, IteratorType1 last1,
-                             IteratorType2 first2, IteratorType2 last2) {
-  return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2,
-                                            last2);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-bool lexicographical_compare(
-    const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(
-      "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1),
-      KE::cend(view1), KE::cbegin(view2), KE::cend(view2));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-bool lexicographical_compare(
-    const std::string& label, const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1),
-                                            KE::cend(view1), KE::cbegin(view2),
-                                            KE::cend(view2));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ComparatorType>
-bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1,
-                             IteratorType1 last1, IteratorType2 first2,
-                             IteratorType2 last2, ComparatorType comp) {
-  return Impl::lexicographical_compare_impl(
-      "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1,
-      first2, last2, comp);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ComparatorType>
-bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType1 first1, IteratorType1 last1,
-                             IteratorType2 first2, IteratorType2 last2,
-                             ComparatorType comp) {
-  return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2,
-                                            last2, comp);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ComparatorType>
-bool lexicographical_compare(
-    const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(
-      "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1),
-      KE::cend(view1), KE::cbegin(view2), KE::cend(view2), comp);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ComparatorType>
-bool lexicographical_compare(
-    const std::string& label, const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view1,
-    ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1),
-                                            KE::cend(view1), KE::cbegin(view2),
-                                            KE::cend(view2), comp);
-}
-
-// ----------------------------------
-// adjacent_find
-// ----------------------------------
-// overload set1
-template <class ExecutionSpace, class IteratorType>
-IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first,
-                           IteratorType last) {
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default",
-                                  ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex,
-                           IteratorType first, IteratorType last) {
-  return Impl::adjacent_find_impl(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto adjacent_find(const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex,
-                                  KE::begin(v), KE::end(v));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto adjacent_find(const std::string& label, const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType, Properties...>& v) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v));
-}
-
-// overload set2
-template <class ExecutionSpace, class IteratorType, class BinaryPredicateType>
-IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first,
-                           IteratorType last, BinaryPredicateType pred) {
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default",
-                                  ex, first, last, pred);
-}
-
-template <class ExecutionSpace, class IteratorType, class BinaryPredicateType>
-IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex,
-                           IteratorType first, IteratorType last,
-                           BinaryPredicateType pred) {
-  return Impl::adjacent_find_impl(label, ex, first, last, pred);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicateType>
-auto adjacent_find(const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType, Properties...>& v,
-                   BinaryPredicateType pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex,
-                                  KE::begin(v), KE::end(v), pred);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicateType>
-auto adjacent_find(const std::string& label, const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType, Properties...>& v,
-                   BinaryPredicateType pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v), pred);
-}
-
-// ----------------------------------
-// search
-// ----------------------------------
-// overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first,
-                     IteratorType1 last, IteratorType2 s_first,
-                     IteratorType2 s_last) {
-  return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first,
-                           last, s_first, s_last);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 search(const std::string& label, const ExecutionSpace& ex,
-                     IteratorType1 first, IteratorType1 last,
-                     IteratorType2 s_first, IteratorType2 s_last) {
-  return Impl::search_impl(label, ex, first, last, s_first, s_last);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto search(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType1, Properties1...>& view,
-            const ::Kokkos::View<DataType2, Properties2...>& s_view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl("Kokkos::search_view_api_default", ex,
-                           KE::begin(view), KE::end(view), KE::begin(s_view),
-                           KE::end(s_view));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto search(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType1, Properties1...>& view,
-            const ::Kokkos::View<DataType2, Properties2...>& s_view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl(label, ex, KE::begin(view), KE::end(view),
-                           KE::begin(s_view), KE::end(s_view));
-}
-
-// overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first,
-                     IteratorType1 last, IteratorType2 s_first,
-                     IteratorType2 s_last, const BinaryPredicateType& pred) {
-  return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first,
-                           last, s_first, s_last, pred);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 search(const std::string& label, const ExecutionSpace& ex,
-                     IteratorType1 first, IteratorType1 last,
-                     IteratorType2 s_first, IteratorType2 s_last,
-                     const BinaryPredicateType& pred) {
-  return Impl::search_impl(label, ex, first, last, s_first, s_last, pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto search(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType1, Properties1...>& view,
-            const ::Kokkos::View<DataType2, Properties2...>& s_view,
-            const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl("Kokkos::search_view_api_default", ex,
-                           KE::begin(view), KE::end(view), KE::begin(s_view),
-                           KE::end(s_view), pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto search(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType1, Properties1...>& view,
-            const ::Kokkos::View<DataType2, Properties2...>& s_view,
-            const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_impl(label, ex, KE::begin(view), KE::end(view),
-                           KE::begin(s_view), KE::end(s_view), pred);
-}
-
-// ----------------------------------
-// find_first_of
-// ----------------------------------
-// overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first,
-                            IteratorType1 last, IteratorType2 s_first,
-                            IteratorType2 s_last) {
-  return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default",
-                                  ex, first, last, s_first, s_last);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 s_first, IteratorType2 s_last) {
-  return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto find_first_of(const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& view,
-                   const ::Kokkos::View<DataType2, Properties2...>& s_view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex,
-                                  KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto find_first_of(const std::string& label, const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& view,
-                   const ::Kokkos::View<DataType2, Properties2...>& s_view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view));
-}
-
-// overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first,
-                            IteratorType1 last, IteratorType2 s_first,
-                            IteratorType2 s_last,
-                            const BinaryPredicateType& pred) {
-  return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default",
-                                  ex, first, last, s_first, s_last, pred);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 s_first, IteratorType2 s_last,
-                            const BinaryPredicateType& pred) {
-  return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last,
-                                  pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto find_first_of(const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& view,
-                   const ::Kokkos::View<DataType2, Properties2...>& s_view,
-                   const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex,
-                                  KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view), pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto find_first_of(const std::string& label, const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& view,
-                   const ::Kokkos::View<DataType2, Properties2...>& s_view,
-                   const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view),
-                                  KE::begin(s_view), KE::end(s_view), pred);
-}
-
-// ----------------------------------
-// search_n
-// ----------------------------------
-// overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType>
-IteratorType search_n(const ExecutionSpace& ex, IteratorType first,
-                      IteratorType last, SizeType count,
-                      const ValueType& value) {
-  return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first,
-                             last, count, value);
-}
-
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType>
-IteratorType search_n(const std::string& label, const ExecutionSpace& ex,
-                      IteratorType first, IteratorType last, SizeType count,
-                      const ValueType& value) {
-  return Impl::search_n_impl(label, ex, first, last, count, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType>
-auto search_n(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& view,
-              SizeType count, const ValueType& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex,
-                             KE::begin(view), KE::end(view), count, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType>
-auto search_n(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& view,
-              SizeType count, const ValueType& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count,
-                             value);
-}
-
-// overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType, class BinaryPredicateType>
-IteratorType search_n(const ExecutionSpace& ex, IteratorType first,
-                      IteratorType last, SizeType count, const ValueType& value,
-                      const BinaryPredicateType& pred) {
-  return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first,
-                             last, count, value, pred);
-}
-
-template <class ExecutionSpace, class IteratorType, class SizeType,
-          class ValueType, class BinaryPredicateType>
-IteratorType search_n(const std::string& label, const ExecutionSpace& ex,
-                      IteratorType first, IteratorType last, SizeType count,
-                      const ValueType& value, const BinaryPredicateType& pred) {
-  return Impl::search_n_impl(label, ex, first, last, count, value, pred);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType, class BinaryPredicateType>
-auto search_n(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& view,
-              SizeType count, const ValueType& value,
-              const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex,
-                             KE::begin(view), KE::end(view), count, value,
-                             pred);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class ValueType, class BinaryPredicateType>
-auto search_n(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& view,
-              SizeType count, const ValueType& value,
-              const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count,
-                             value, pred);
-}
-
-// ----------------------------------
-// find_end
-// ----------------------------------
-// overload set 1: no binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first,
-                       IteratorType1 last, IteratorType2 s_first,
-                       IteratorType2 s_last) {
-  return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first,
-                             last, s_first, s_last);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex,
-                       IteratorType1 first, IteratorType1 last,
-                       IteratorType2 s_first, IteratorType2 s_last) {
-  return Impl::find_end_impl(label, ex, first, last, s_first, s_last);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto find_end(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view,
-              const ::Kokkos::View<DataType2, Properties2...>& s_view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex,
-                             KE::begin(view), KE::end(view), KE::begin(s_view),
-                             KE::end(s_view));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto find_end(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view,
-              const ::Kokkos::View<DataType2, Properties2...>& s_view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view),
-                             KE::begin(s_view), KE::end(s_view));
-}
-
-// overload set 2: binary predicate passed
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first,
-                       IteratorType1 last, IteratorType2 s_first,
-                       IteratorType2 s_last, const BinaryPredicateType& pred) {
-  return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first,
-                             last, s_first, s_last, pred);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class BinaryPredicateType>
-IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex,
-                       IteratorType1 first, IteratorType1 last,
-                       IteratorType2 s_first, IteratorType2 s_last,
-                       const BinaryPredicateType& pred) {
-  return Impl::find_end_impl(label, ex, first, last, s_first, s_last, pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto find_end(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view,
-              const ::Kokkos::View<DataType2, Properties2...>& s_view,
-              const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex,
-                             KE::begin(view), KE::end(view), KE::begin(s_view),
-                             KE::end(s_view), pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicateType>
-auto find_end(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType1, Properties1...>& view,
-              const ::Kokkos::View<DataType2, Properties2...>& s_view,
-              const BinaryPredicateType& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view),
-                             KE::begin(s_view), KE::end(s_view), pred);
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp
new file mode 100644
index 000000000..30ffb5244
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_NONE_OF_HPP
+#define KOKKOS_STD_ALGORITHMS_NONE_OF_HPP
+
+#include "impl/Kokkos_AllOfAnyOfNoneOf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+bool none_of(const ExecutionSpace& ex, IteratorType first, IteratorType last,
+             Predicate predicate) {
+  return Impl::none_of_impl("Kokkos::none_of_iterator_api_default", ex, first,
+                            last, predicate);
+}
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+bool none_of(const std::string& label, const ExecutionSpace& ex,
+             IteratorType first, IteratorType last, Predicate predicate) {
+  return Impl::none_of_impl(label, ex, first, last, predicate);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+bool none_of(const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType, Properties...>& v,
+             Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::none_of_impl("Kokkos::none_of_view_api_default", ex,
+                            KE::cbegin(v), KE::cend(v), std::move(predicate));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class Predicate>
+bool none_of(const std::string& label, const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType, Properties...>& v,
+             Predicate predicate) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::none_of_impl(label, ex, KE::cbegin(v), KE::cend(v),
+                            std::move(predicate));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp
new file mode 100644
index 000000000..5b00669fd
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp
@@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_PARTITION_COPY_HPP
+#define KOKKOS_STD_ALGORITHMS_PARTITION_COPY_HPP
+
+#include "impl/Kokkos_PartitionCopy.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorTrueType, class OutputIteratorFalseType,
+          class PredicateType>
+::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType> partition_copy(
+    const ExecutionSpace& ex, InputIteratorType from_first,
+    InputIteratorType from_last, OutputIteratorTrueType to_first_true,
+    OutputIteratorFalseType to_first_false, PredicateType p) {
+  return Impl::partition_copy_impl(
+      "Kokkos::partition_copy_iterator_api_default", ex, from_first, from_last,
+      to_first_true, to_first_false, std::move(p));
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorTrueType, class OutputIteratorFalseType,
+          class PredicateType>
+::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType> partition_copy(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType from_first, InputIteratorType from_last,
+    OutputIteratorTrueType to_first_true,
+    OutputIteratorFalseType to_first_false, PredicateType p) {
+  return Impl::partition_copy_impl(label, ex, from_first, from_last,
+                                   to_first_true, to_first_false, std::move(p));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class DataType3,
+          class... Properties3, class PredicateType>
+auto partition_copy(
+    const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest_true,
+    const ::Kokkos::View<DataType3, Properties3...>& view_dest_false,
+    PredicateType p) {
+  return Impl::partition_copy_impl("Kokkos::partition_copy_view_api_default",
+                                   ex, cbegin(view_from), cend(view_from),
+                                   begin(view_dest_true),
+                                   begin(view_dest_false), std::move(p));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class DataType3,
+          class... Properties3, class PredicateType>
+auto partition_copy(
+    const std::string& label, const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest_true,
+    const ::Kokkos::View<DataType3, Properties3...>& view_dest_false,
+    PredicateType p) {
+  return Impl::partition_copy_impl(label, ex, cbegin(view_from),
+                                   cend(view_from), begin(view_dest_true),
+                                   begin(view_dest_false), std::move(p));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp
new file mode 100644
index 000000000..b714d5a27
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp
@@ -0,0 +1,91 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_PARTITION_POINT_HPP
+#define KOKKOS_STD_ALGORITHMS_PARTITION_POINT_HPP
+
+#include "impl/Kokkos_PartitionPoint.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType, class UnaryPredicate>
+IteratorType partition_point(const ExecutionSpace& ex, IteratorType first,
+                             IteratorType last, UnaryPredicate p) {
+  return Impl::partition_point_impl(
+      "Kokkos::partitioned_point_iterator_api_default", ex, first, last,
+      std::move(p));
+}
+
+template <class ExecutionSpace, class IteratorType, class UnaryPredicate>
+IteratorType partition_point(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, IteratorType last,
+                             UnaryPredicate p) {
+  return Impl::partition_point_impl(label, ex, first, last, std::move(p));
+}
+
+template <class ExecutionSpace, class UnaryPredicate, class DataType,
+          class... Properties>
+auto partition_point(const std::string& label, const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType, Properties...>& v,
+                     UnaryPredicate p) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  return Impl::partition_point_impl(label, ex, begin(v), end(v), std::move(p));
+}
+
+template <class ExecutionSpace, class UnaryPredicate, class DataType,
+          class... Properties>
+auto partition_point(const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType, Properties...>& v,
+                     UnaryPredicate p) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
+  return Impl::partition_point_impl("Kokkos::partition_point_view_api_default",
+                                    ex, begin(v), end(v), std::move(p));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitioningOperations.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitioningOperations.hpp
deleted file mode 100644
index 9806084fc..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_PartitioningOperations.hpp
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_STD_PARTITIONING_OPERATIONS_HPP
-#define KOKKOS_STD_PARTITIONING_OPERATIONS_HPP
-
-#include <Kokkos_Core.hpp>
-#include "Kokkos_BeginEnd.hpp"
-#include "Kokkos_Constraints.hpp"
-#include "Kokkos_ModifyingOperations.hpp"
-#include "Kokkos_NonModifyingSequenceOperations.hpp"
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-// -------------------------
-//
-// functors
-//
-// -------------------------
-
-template <class IteratorType, class ReducerType, class PredicateType>
-struct StdIsPartitionedFunctor {
-  using red_value_type = typename ReducerType::value_type;
-  using index_type     = typename IteratorType::difference_type;
-
-  IteratorType m_first;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, red_value_type& redValue) const {
-    const auto predicate_value = m_p(m_first[i]);
-    constexpr index_type m_red_id_min =
-        ::Kokkos::reduction_identity<index_type>::min();
-    constexpr index_type m_red_id_max =
-        ::Kokkos::reduction_identity<index_type>::max();
-    auto rv = predicate_value ? red_value_type{i, m_red_id_min}
-                              : red_value_type{m_red_id_max, i};
-
-    m_reducer.join(redValue, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdIsPartitionedFunctor(IteratorType first, ReducerType reducer,
-                          PredicateType p)
-      : m_first(std::move(first)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-template <class IteratorType, class ReducerType, class PredicateType>
-struct StdPartitionPointFunctor {
-  using red_value_type = typename ReducerType::value_type;
-  using index_type     = typename IteratorType::difference_type;
-
-  IteratorType m_first;
-  ReducerType m_reducer;
-  PredicateType m_p;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, red_value_type& redValue) const {
-    const auto predicate_value = m_p(m_first[i]);
-    auto rv =
-        predicate_value
-            ? red_value_type{::Kokkos::reduction_identity<index_type>::min()}
-            : red_value_type{i};
-    m_reducer.join(redValue, rv);
-  }
-
-  KOKKOS_FUNCTION
-  StdPartitionPointFunctor(IteratorType first, ReducerType reducer,
-                           PredicateType p)
-      : m_first(std::move(first)),
-        m_reducer(std::move(reducer)),
-        m_p(std::move(p)) {}
-};
-
-template <class ValueType>
-struct StdPartitionCopyScalar {
-  ValueType true_count_;
-  ValueType false_count_;
-
-  // Here we implement the copy assignment operators explicitly for consistency
-  // with how the Scalar structs are implemented inside
-  // Kokkos_Parallel_Reduce.hpp.
-  KOKKOS_FUNCTION
-  void operator=(const StdPartitionCopyScalar& other) {
-    true_count_  = other.true_count_;
-    false_count_ = other.false_count_;
-  }
-
-  KOKKOS_FUNCTION
-  void operator=(const volatile StdPartitionCopyScalar& other) volatile {
-    true_count_  = other.true_count_;
-    false_count_ = other.false_count_;
-  }
-
-  // this is needed for
-  // OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp:699:21: error: no viable
-  // overloaded '=' m_returnvalue = 0;
-  //
-  KOKKOS_FUNCTION
-  void operator=(const ValueType value) {
-    true_count_  = value;
-    false_count_ = value;
-  }
-};
-
-template <class IndexType, class FirstFrom, class FirstDestTrue,
-          class FirstDestFalse, class PredType>
-struct StdPartitionCopyFunctor {
-  using value_type = StdPartitionCopyScalar<IndexType>;
-
-  FirstFrom m_first_from;
-  FirstDestTrue m_first_dest_true;
-  FirstDestFalse m_first_dest_false;
-  PredType m_pred;
-
-  KOKKOS_FUNCTION
-  StdPartitionCopyFunctor(FirstFrom first_from, FirstDestTrue first_dest_true,
-                          FirstDestFalse first_dest_false, PredType pred)
-      : m_first_from(std::move(first_from)),
-        m_first_dest_true(std::move(first_dest_true)),
-        m_first_dest_false(std::move(first_dest_false)),
-        m_pred(std::move(pred)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    const auto& myval = m_first_from[i];
-    if (final_pass) {
-      if (m_pred(myval)) {
-        m_first_dest_true[update.true_count_] = myval;
-      } else {
-        m_first_dest_false[update.false_count_] = myval;
-      }
-    }
-
-    if (m_pred(myval)) {
-      update.true_count_ += 1;
-    } else {
-      update.false_count_ += 1;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.true_count_  = 0;
-    update.false_count_ = 0;
-  }
-
-  KOKKOS_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
-    update.true_count_ += input.true_count_;
-    update.false_count_ += input.false_count_;
-  }
-};
-
-// ------------------------------------------
-// is_partitioned_impl
-// ------------------------------------------
-
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last,
-                         PredicateType pred) {
-  // true if all elements in the range [first, last) that satisfy
-  // the predicate "pred" appear before all elements that don't.
-  // Also returns true if [first, last) is empty.
-  // also true if all elements satisfy the predicate.
-
-  // we implement it by finding:
-  // - the max location where predicate is true  (max_loc_true)
-  // - the min location where predicate is false (min_loc_false)
-  // so the range is partitioned if max_loc_true < (min_loc_false)
-
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // trivial case
-  if (first == last) {
-    return true;
-  }
-
-  // aliases
-  using index_type           = typename IteratorType::difference_type;
-  using reducer_type         = StdIsPartitioned<index_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using func_t =
-      StdIsPartitionedFunctor<IteratorType, reducer_type, PredicateType>;
-
-  // run
-  reduction_value_type red_result;
-  reducer_type reducer(red_result);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first, reducer, pred), reducer);
-
-  // fence not needed because reducing into scalar
-
-  // decide and return
-  constexpr index_type red_id_min =
-      ::Kokkos::reduction_identity<index_type>::min();
-  constexpr index_type red_id_max =
-      ::Kokkos::reduction_identity<index_type>::max();
-
-  if (red_result.max_loc_true != red_id_max &&
-      red_result.min_loc_false != red_id_min) {
-    return red_result.max_loc_true < red_result.min_loc_false;
-  } else if (first + red_result.max_loc_true == --last) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-// ------------------------------------------
-// partition_point_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType partition_point_impl(const std::string& label,
-                                  const ExecutionSpace& ex, IteratorType first,
-                                  IteratorType last, PredicateType pred) {
-  // locates the end of the first partition, that is, the first
-  // element that does not satisfy p or last if all elements satisfy p.
-  // Implementation below finds the first location where p is false.
-
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    return first;
-  }
-
-  // aliases
-  using index_type           = typename IteratorType::difference_type;
-  using reducer_type         = StdPartitionPoint<index_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-  using func_t =
-      StdPartitionPointFunctor<IteratorType, reducer_type, PredicateType>;
-
-  // run
-  reduction_value_type red_result;
-  reducer_type reducer(red_result);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_t(first, reducer, pred), reducer);
-
-  // fence not needed because reducing into scalar
-
-  // decide and return
-  if (red_result.min_loc_false ==
-      ::Kokkos::reduction_identity<index_type>::min()) {
-    // if all elements are true, return last
-    return last;
-  } else {
-    return first + red_result.min_loc_false;
-  }
-}
-
-// ------------------------------------------
-// partition_copy_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorTrueType, class OutputIteratorFalseType,
-          class PredicateType>
-::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType>
-partition_copy_impl(const std::string& label, const ExecutionSpace& ex,
-                    InputIteratorType from_first, InputIteratorType from_last,
-                    OutputIteratorTrueType to_first_true,
-                    OutputIteratorFalseType to_first_false,
-                    PredicateType pred) {
-  // impl uses a scan, this is similar how we implemented copy_if
-
-  // checks
-  Impl::static_assert_random_access_and_accessible(
-      ex, from_first, to_first_true, to_first_false);
-  Impl::static_assert_iterators_have_matching_difference_type(
-      from_first, to_first_true, to_first_false);
-  Impl::expect_valid_range(from_first, from_last);
-
-  if (from_first == from_last) {
-    return {to_first_true, to_first_false};
-  }
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using func_type =
-      StdPartitionCopyFunctor<index_type, InputIteratorType,
-                              OutputIteratorTrueType, OutputIteratorFalseType,
-                              PredicateType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(from_first, from_last);
-  typename func_type::value_type counts{0, 0};
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(from_first, to_first_true, to_first_false, pred), counts);
-
-  // fence not needed here because of the scan into counts
-
-  return {to_first_true + counts.true_count_,
-          to_first_false + counts.false_count_};
-}
-
-}  // end namespace Impl
-
-// ----------------------
-// is_partitioned public API
-// ----------------------
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-bool is_partitioned(const ExecutionSpace& ex, IteratorType first,
-                    IteratorType last, PredicateType p) {
-  return Impl::is_partitioned_impl(
-      "Kokkos::is_partitioned_iterator_api_default", ex, first, last,
-      std::move(p));
-}
-
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-bool is_partitioned(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last, PredicateType p) {
-  return Impl::is_partitioned_impl(label, ex, first, last, std::move(p));
-}
-
-template <class ExecutionSpace, class PredicateType, class DataType,
-          class... Properties>
-bool is_partitioned(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType, Properties...>& v,
-                    PredicateType p) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::is_partitioned_impl("Kokkos::is_partitioned_view_api_default",
-                                   ex, cbegin(v), cend(v), std::move(p));
-}
-
-template <class ExecutionSpace, class PredicateType, class DataType,
-          class... Properties>
-bool is_partitioned(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType, Properties...>& v,
-                    PredicateType p) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-
-  return Impl::is_partitioned_impl(label, ex, cbegin(v), cend(v), std::move(p));
-}
-
-// ----------------------
-// partition_copy
-// ----------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorTrueType, class OutputIteratorFalseType,
-          class PredicateType>
-::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType> partition_copy(
-    const ExecutionSpace& ex, InputIteratorType from_first,
-    InputIteratorType from_last, OutputIteratorTrueType to_first_true,
-    OutputIteratorFalseType to_first_false, PredicateType p) {
-  return Impl::partition_copy_impl(
-      "Kokkos::partition_copy_iterator_api_default", ex, from_first, from_last,
-      to_first_true, to_first_false, std::move(p));
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorTrueType, class OutputIteratorFalseType,
-          class PredicateType>
-::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType> partition_copy(
-    const std::string& label, const ExecutionSpace& ex,
-    InputIteratorType from_first, InputIteratorType from_last,
-    OutputIteratorTrueType to_first_true,
-    OutputIteratorFalseType to_first_false, PredicateType p) {
-  return Impl::partition_copy_impl(label, ex, from_first, from_last,
-                                   to_first_true, to_first_false, std::move(p));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class PredicateType>
-auto partition_copy(
-    const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest_true,
-    const ::Kokkos::View<DataType3, Properties3...>& view_dest_false,
-    PredicateType p) {
-  return Impl::partition_copy_impl("Kokkos::partition_copy_view_api_default",
-                                   ex, cbegin(view_from), cend(view_from),
-                                   begin(view_dest_true),
-                                   begin(view_dest_false), std::move(p));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class PredicateType>
-auto partition_copy(
-    const std::string& label, const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest_true,
-    const ::Kokkos::View<DataType3, Properties3...>& view_dest_false,
-    PredicateType p) {
-  return Impl::partition_copy_impl(label, ex, cbegin(view_from),
-                                   cend(view_from), begin(view_dest_true),
-                                   begin(view_dest_false), std::move(p));
-}
-
-// ----------------------
-// partition_point
-// ----------------------
-template <class ExecutionSpace, class IteratorType, class UnaryPredicate>
-IteratorType partition_point(const ExecutionSpace& ex, IteratorType first,
-                             IteratorType last, UnaryPredicate p) {
-  return Impl::partition_point_impl(
-      "Kokkos::partitioned_point_iterator_api_default", ex, first, last,
-      std::move(p));
-}
-
-template <class ExecutionSpace, class IteratorType, class UnaryPredicate>
-IteratorType partition_point(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, IteratorType last,
-                             UnaryPredicate p) {
-  return Impl::partition_point_impl(label, ex, first, last, std::move(p));
-}
-
-template <class ExecutionSpace, class UnaryPredicate, class DataType,
-          class... Properties>
-auto partition_point(const std::string& label, const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType, Properties...>& v,
-                     UnaryPredicate p) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  return Impl::partition_point_impl(label, ex, begin(v), end(v), std::move(p));
-}
-
-template <class ExecutionSpace, class UnaryPredicate, class DataType,
-          class... Properties>
-auto partition_point(const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType, Properties...>& v,
-                     UnaryPredicate p) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v);
-  return Impl::partition_point_impl("Kokkos::partition_point_view_api_default",
-                                    ex, begin(v), end(v), std::move(p));
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_Reduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp
similarity index 70%
rename from packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_Reduce.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp
index bf03f6e98..3cf915320 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_Reduce.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp
@@ -42,119 +42,14 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_NUMERICS_REDUCE_HPP
-#define KOKKOS_STD_NUMERICS_REDUCE_HPP
+#ifndef KOKKOS_STD_ALGORITHMS_REDUCE_HPP
+#define KOKKOS_STD_ALGORITHMS_REDUCE_HPP
 
-#include <Kokkos_Core.hpp>
-#include "../Kokkos_BeginEnd.hpp"
-#include "../Kokkos_Constraints.hpp"
-#include "../Kokkos_Distance.hpp"
-#include "../Kokkos_ModifyingOperations.hpp"
-#include "../Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp"
+#include "impl/Kokkos_Reduce.hpp"
+#include "Kokkos_BeginEnd.hpp"
 
 namespace Kokkos {
 namespace Experimental {
-namespace Impl {
-
-template <class ValueType>
-struct StdReduceDefaultJoinFunctor {
-  KOKKOS_FUNCTION
-  constexpr ValueType operator()(const ValueType& a, const ValueType& b) const {
-    return a + b;
-  }
-
-  KOKKOS_FUNCTION
-  constexpr ValueType operator()(const volatile ValueType& a,
-                                 const volatile ValueType& b) const {
-    return a + b;
-  }
-};
-
-template <class IteratorType, class ReducerType>
-struct StdReduceFunctor {
-  using red_value_type = typename ReducerType::value_type;
-  using index_type     = typename IteratorType::difference_type;
-
-  const IteratorType m_first;
-  const ReducerType m_reducer;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, red_value_type& red_value) const {
-    auto tmp_wrapped_value = red_value_type{m_first[i], false};
-
-    if (red_value.is_initial) {
-      red_value = tmp_wrapped_value;
-    } else {
-      m_reducer.join(red_value, tmp_wrapped_value);
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdReduceFunctor(IteratorType first, ReducerType reducer)
-      : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
-};
-
-//------------------------------
-// reduce_custom_functors_impl
-//------------------------------
-template <class ExecutionSpace, class IteratorType, class ValueType,
-          class JoinerType>
-ValueType reduce_custom_functors_impl(const std::string& label,
-                                      const ExecutionSpace& ex,
-                                      IteratorType first, IteratorType last,
-                                      ValueType init_reduction_value,
-                                      JoinerType joiner) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    // init is returned, unmodified
-    return init_reduction_value;
-  }
-
-  // aliases
-  using reducer_type =
-      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
-  using functor_type         = StdReduceFunctor<IteratorType, reducer_type>;
-  using reduction_value_type = typename reducer_type::value_type;
-
-  // run
-  reduction_value_type result;
-  reducer_type reducer(result, joiner);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            functor_type(first, reducer), reducer);
-
-  // fence not needed since reducing into scalar
-  return joiner(result.val, init_reduction_value);
-}
-
-template <class ExecutionSpace, class IteratorType, class ValueType>
-ValueType reduce_default_functors_impl(const std::string& label,
-                                       const ExecutionSpace& ex,
-                                       IteratorType first, IteratorType last,
-                                       ValueType init_reduction_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::expect_valid_range(first, last);
-
-  using value_type  = Kokkos::Impl::remove_cvref_t<ValueType>;
-  using joiner_type = Impl::StdReduceDefaultJoinFunctor<value_type>;
-  return reduce_custom_functors_impl(
-      label, ex, first, last, std::move(init_reduction_value), joiner_type());
-}
-
-}  // end namespace Impl
-
-///////////////////////////////
-//
-// reduce public API
-//
-///////////////////////////////
 
 //
 // overload set 1
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp
new file mode 100644
index 000000000..d8d7c999b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Remove.hpp
@@ -0,0 +1,91 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REMOVE_HPP
+#define KOKKOS_STD_ALGORITHMS_REMOVE_HPP
+
+#include "impl/Kokkos_RemoveAllVariants.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class Iterator, class ValueType>
+Iterator remove(const ExecutionSpace& ex, Iterator first, Iterator last,
+                const ValueType& value) {
+  return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex, first,
+                           last, value);
+}
+
+template <class ExecutionSpace, class Iterator, class ValueType>
+Iterator remove(const std::string& label, const ExecutionSpace& ex,
+                Iterator first, Iterator last, const ValueType& value) {
+  return Impl::remove_impl(label, ex, first, last, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class ValueType>
+auto remove(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view,
+            const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex,
+                           ::Kokkos::Experimental::begin(view),
+                           ::Kokkos::Experimental::end(view), value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class ValueType>
+auto remove(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view,
+            const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::remove_impl(label, ex, ::Kokkos::Experimental::begin(view),
+                           ::Kokkos::Experimental::end(view), value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp
new file mode 100644
index 000000000..7d5c163af
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp
@@ -0,0 +1,106 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REMOVE_COPY_HPP
+#define KOKKOS_STD_ALGORITHMS_REMOVE_COPY_HPP
+
+#include "impl/Kokkos_RemoveAllVariants.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class ValueType>
+OutputIterator remove_copy(const ExecutionSpace& ex, InputIterator first_from,
+                           InputIterator last_from, OutputIterator first_dest,
+                           const ValueType& value) {
+  return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex,
+                                first_from, last_from, first_dest, value);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class ValueType>
+OutputIterator remove_copy(const std::string& label, const ExecutionSpace& ex,
+                           InputIterator first_from, InputIterator last_from,
+                           OutputIterator first_dest, const ValueType& value) {
+  return Impl::remove_copy_impl(label, ex, first_from, last_from, first_dest,
+                                value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType>
+auto remove_copy(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                 const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                 const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+
+  return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex,
+                                ::Kokkos::Experimental::cbegin(view_from),
+                                ::Kokkos::Experimental::cend(view_from),
+                                ::Kokkos::Experimental::begin(view_dest),
+                                value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType>
+auto remove_copy(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                 const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                 const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+
+  return Impl::remove_copy_impl(
+      label, ex, ::Kokkos::Experimental::cbegin(view_from),
+      ::Kokkos::Experimental::cend(view_from),
+      ::Kokkos::Experimental::begin(view_dest), value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp
new file mode 100644
index 000000000..8a9a3e4c1
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp
@@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REMOVE_COPY_IF_HPP
+#define KOKKOS_STD_ALGORITHMS_REMOVE_COPY_IF_HPP
+
+#include "impl/Kokkos_RemoveAllVariants.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class UnaryPredicate>
+OutputIterator remove_copy_if(const ExecutionSpace& ex,
+                              InputIterator first_from, InputIterator last_from,
+                              OutputIterator first_dest,
+                              const UnaryPredicate& pred) {
+  return Impl::remove_copy_if_impl(
+      "Kokkos::remove_copy_if_iterator_api_default", ex, first_from, last_from,
+      first_dest, pred);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class UnaryPredicate>
+OutputIterator remove_copy_if(const std::string& label,
+                              const ExecutionSpace& ex,
+                              InputIterator first_from, InputIterator last_from,
+                              OutputIterator first_dest,
+                              const UnaryPredicate& pred) {
+  return Impl::remove_copy_if_impl(label, ex, first_from, last_from, first_dest,
+                                   pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class UnaryPredicate>
+auto remove_copy_if(const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    const UnaryPredicate& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+
+  return Impl::remove_copy_if_impl(
+      "Kokkos::remove_copy_if_iterator_api_default", ex,
+      ::Kokkos::Experimental::cbegin(view_from),
+      ::Kokkos::Experimental::cend(view_from),
+      ::Kokkos::Experimental::begin(view_dest), pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class UnaryPredicate>
+auto remove_copy_if(const std::string& label, const ExecutionSpace& ex,
+                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                    const UnaryPredicate& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+
+  return Impl::remove_copy_if_impl(
+      label, ex, ::Kokkos::Experimental::cbegin(view_from),
+      ::Kokkos::Experimental::cend(view_from),
+      ::Kokkos::Experimental::begin(view_dest), pred);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp
new file mode 100644
index 000000000..e4171ca91
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp
@@ -0,0 +1,92 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REMOVE_IF_HPP
+#define KOKKOS_STD_ALGORITHMS_REMOVE_IF_HPP
+
+#include "impl/Kokkos_RemoveAllVariants.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class Iterator, class UnaryPredicate>
+Iterator remove_if(const ExecutionSpace& ex, Iterator first, Iterator last,
+                   UnaryPredicate pred) {
+  return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex,
+                              first, last, pred);
+}
+
+template <class ExecutionSpace, class Iterator, class UnaryPredicate>
+Iterator remove_if(const std::string& label, const ExecutionSpace& ex,
+                   Iterator first, Iterator last, UnaryPredicate pred) {
+  return Impl::remove_if_impl(label, ex, first, last, pred);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class UnaryPredicate>
+auto remove_if(const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType, Properties...>& view,
+               UnaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex,
+                              ::Kokkos::Experimental::begin(view),
+                              ::Kokkos::Experimental::end(view), pred);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class UnaryPredicate>
+auto remove_if(const std::string& label, const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType, Properties...>& view,
+               UnaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::remove_if_impl(label, ex, ::Kokkos::Experimental::begin(view),
+                              ::Kokkos::Experimental::end(view), pred);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp
new file mode 100644
index 000000000..10ca46af2
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Replace.hpp
@@ -0,0 +1,93 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_HPP
+
+#include "impl/Kokkos_Replace.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class Iterator, class ValueType>
+void replace(const ExecutionSpace& ex, Iterator first, Iterator last,
+             const ValueType& old_value, const ValueType& new_value) {
+  return Impl::replace_impl("Kokkos::replace_iterator_api", ex, first, last,
+                            old_value, new_value);
+}
+
+template <class ExecutionSpace, class Iterator, class ValueType>
+void replace(const std::string& label, const ExecutionSpace& ex, Iterator first,
+             Iterator last, const ValueType& old_value,
+             const ValueType& new_value) {
+  return Impl::replace_impl(label, ex, first, last, old_value, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class ValueType>
+void replace(const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType1, Properties1...>& view,
+             const ValueType& old_value, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_impl("Kokkos::replace_view_api", ex, KE::begin(view),
+                            KE::end(view), old_value, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class ValueType>
+void replace(const std::string& label, const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType1, Properties1...>& view,
+             const ValueType& old_value, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_impl(label, ex, KE::begin(view), KE::end(view),
+                            old_value, new_value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp
new file mode 100644
index 000000000..f5136eb43
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp
@@ -0,0 +1,107 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_COPY_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_COPY_HPP
+
+#include "impl/Kokkos_ReplaceCopy.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class ValueType>
+OutputIterator replace_copy(const ExecutionSpace& ex, InputIterator first_from,
+                            InputIterator last_from, OutputIterator first_dest,
+                            const ValueType& old_value,
+                            const ValueType& new_value) {
+  return Impl::replace_copy_impl("Kokkos::replace_copy_iterator_api", ex,
+                                 first_from, last_from, first_dest, old_value,
+                                 new_value);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class ValueType>
+OutputIterator replace_copy(const std::string& label, const ExecutionSpace& ex,
+                            InputIterator first_from, InputIterator last_from,
+                            OutputIterator first_dest,
+                            const ValueType& old_value,
+                            const ValueType& new_value) {
+  return Impl::replace_copy_impl(label, ex, first_from, last_from, first_dest,
+                                 old_value, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType>
+auto replace_copy(const ExecutionSpace& ex,
+                  const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                  const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                  const ValueType& old_value, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_copy_impl("Kokkos::replace_copy_view_api", ex,
+                                 KE::cbegin(view_from), KE::cend(view_from),
+                                 KE::begin(view_dest), old_value, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType>
+auto replace_copy(const std::string& label, const ExecutionSpace& ex,
+                  const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                  const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                  const ValueType& old_value, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_copy_impl(label, ex, KE::cbegin(view_from),
+                                 KE::cend(view_from), KE::begin(view_dest),
+                                 old_value, new_value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp
new file mode 100644
index 000000000..a3f3fe69a
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp
@@ -0,0 +1,111 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_COPY_IF_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_COPY_IF_HPP
+
+#include "impl/Kokkos_ReplaceCopyIf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class PredicateType, class ValueType>
+OutputIterator replace_copy_if(const ExecutionSpace& ex,
+                               InputIterator first_from,
+                               InputIterator last_from,
+                               OutputIterator first_dest, PredicateType pred,
+                               const ValueType& new_value) {
+  return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_iterator_api", ex,
+                                    first_from, last_from, first_dest, pred,
+                                    new_value);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class PredicateType, class ValueType>
+OutputIterator replace_copy_if(const std::string& label,
+                               const ExecutionSpace& ex,
+                               InputIterator first_from,
+                               InputIterator last_from,
+                               OutputIterator first_dest, PredicateType pred,
+                               const ValueType& new_value) {
+  return Impl::replace_copy_if_impl(label, ex, first_from, last_from,
+                                    first_dest, pred, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class PredicateType,
+          class ValueType>
+auto replace_copy_if(const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                     PredicateType pred, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_view_api", ex,
+                                    KE::cbegin(view_from), KE::cend(view_from),
+                                    KE::begin(view_dest), pred, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class PredicateType,
+          class ValueType>
+auto replace_copy_if(const std::string& label, const ExecutionSpace& ex,
+                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
+                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+                     PredicateType pred, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_copy_if_impl(label, ex, KE::cbegin(view_from),
+                                    KE::cend(view_from), KE::begin(view_dest),
+                                    pred, new_value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp
new file mode 100644
index 000000000..bdb59f28a
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_IF_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_IF_HPP
+
+#include "impl/Kokkos_ReplaceIf.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class Predicate,
+          class ValueType>
+void replace_if(const ExecutionSpace& ex, InputIterator first,
+                InputIterator last, Predicate pred,
+                const ValueType& new_value) {
+  return Impl::replace_if_impl("Kokkos::replace_if_iterator_api", ex, first,
+                               last, pred, new_value);
+}
+
+template <class ExecutionSpace, class InputIterator, class Predicate,
+          class ValueType>
+void replace_if(const std::string& label, const ExecutionSpace& ex,
+                InputIterator first, InputIterator last, Predicate pred,
+                const ValueType& new_value) {
+  return Impl::replace_if_impl(label, ex, first, last, pred, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class Predicate, class ValueType>
+void replace_if(const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType1, Properties1...>& view,
+                Predicate pred, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_if_impl("Kokkos::replace_if_view_api", ex,
+                               KE::begin(view), KE::end(view), pred, new_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class Predicate, class ValueType>
+void replace_if(const std::string& label, const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType1, Properties1...>& view,
+                Predicate pred, const ValueType& new_value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::replace_if_impl(label, ex, KE::begin(view), KE::end(view), pred,
+                               new_value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp
new file mode 100644
index 000000000..4848b20f6
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reverse.hpp
@@ -0,0 +1,87 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REVERSE_HPP
+#define KOKKOS_STD_ALGORITHMS_REVERSE_HPP
+
+#include "impl/Kokkos_Reverse.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator>
+void reverse(const ExecutionSpace& ex, InputIterator first,
+             InputIterator last) {
+  return Impl::reverse_impl("Kokkos::reverse_iterator_api_default", ex, first,
+                            last);
+}
+
+template <class ExecutionSpace, class InputIterator>
+void reverse(const std::string& label, const ExecutionSpace& ex,
+             InputIterator first, InputIterator last) {
+  return Impl::reverse_impl(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+void reverse(const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::reverse_impl("Kokkos::reverse_view_api_default", ex,
+                            KE::begin(view), KE::end(view));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+void reverse(const std::string& label, const ExecutionSpace& ex,
+             const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::reverse_impl(label, ex, KE::begin(view), KE::end(view));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
new file mode 100644
index 000000000..bb4462bf4
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REVERSE_COPY_HPP
+#define KOKKOS_STD_ALGORITHMS_REVERSE_COPY_HPP
+
+#include "impl/Kokkos_ReverseCopy.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator reverse_copy(const ExecutionSpace& ex, InputIterator first,
+                            InputIterator last, OutputIterator d_first) {
+  return Impl::reverse_copy_impl("Kokkos::reverse_copy_iterator_api_default",
+                                 ex, first, last, d_first);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator reverse_copy(const std::string& label, const ExecutionSpace& ex,
+                            InputIterator first, InputIterator last,
+                            OutputIterator d_first) {
+  return Impl::reverse_copy_impl(label, ex, first, last, d_first);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto reverse_copy(const ExecutionSpace& ex,
+                  const ::Kokkos::View<DataType1, Properties1...>& source,
+                  ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::reverse_copy_impl("Kokkos::reverse_copy_view_api_default", ex,
+                                 cbegin(source), cend(source), begin(dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto reverse_copy(const std::string& label, const ExecutionSpace& ex,
+                  const ::Kokkos::View<DataType1, Properties1...>& source,
+                  ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::reverse_copy_impl(label, ex, cbegin(source), cend(source),
+                                 begin(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp
new file mode 100644
index 000000000..39975811a
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Rotate.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ROTATE_HPP
+#define KOKKOS_STD_ALGORITHMS_ROTATE_HPP
+
+#include "impl/Kokkos_Rotate.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType rotate(const ExecutionSpace& ex, IteratorType first,
+                    IteratorType n_first, IteratorType last) {
+  return Impl::rotate_impl("Kokkos::rotate_iterator_api_default", ex, first,
+                           n_first, last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType rotate(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType n_first,
+                    IteratorType last) {
+  return Impl::rotate_impl(label, ex, first, n_first, last);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto rotate(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view,
+            std::size_t n_location) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::rotate_impl("Kokkos::rotate_view_api_default", ex, begin(view),
+                           begin(view) + n_location, end(view));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto rotate(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view,
+            std::size_t n_location) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::rotate_impl(label, ex, begin(view), begin(view) + n_location,
+                           end(view));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp
new file mode 100644
index 000000000..f98686ab6
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ROTATE_COPY_HPP
+#define KOKKOS_STD_ALGORITHMS_ROTATE_COPY_HPP
+
+#include "impl/Kokkos_RotateCopy.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator rotate_copy(const ExecutionSpace& ex, InputIterator first,
+                           InputIterator n_first, InputIterator last,
+                           OutputIterator d_first) {
+  return Impl::rotate_copy_impl("Kokkos::rotate_copy_iterator_api_default", ex,
+                                first, n_first, last, d_first);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator rotate_copy(const std::string& label, const ExecutionSpace& ex,
+                           InputIterator first, InputIterator n_first,
+                           InputIterator last, OutputIterator d_first) {
+  return Impl::rotate_copy_impl(label, ex, first, n_first, last, d_first);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto rotate_copy(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 std::size_t n_location,
+                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::rotate_copy_impl("Kokkos::rotate_copy_view_api_default", ex,
+                                cbegin(source), cbegin(source) + n_location,
+                                cend(source), begin(dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto rotate_copy(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 std::size_t n_location,
+                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::rotate_copy_impl(label, ex, cbegin(source),
+                                cbegin(source) + n_location, cend(source),
+                                begin(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp
new file mode 100644
index 000000000..ce656da31
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Search.hpp
@@ -0,0 +1,148 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SEARCH_HPP
+#define KOKKOS_STD_ALGORITHMS_SEARCH_HPP
+
+#include "impl/Kokkos_Search.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set 1: no binary predicate passed
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first,
+                     IteratorType1 last, IteratorType2 s_first,
+                     IteratorType2 s_last) {
+  return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first,
+                           last, s_first, s_last);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 search(const std::string& label, const ExecutionSpace& ex,
+                     IteratorType1 first, IteratorType1 last,
+                     IteratorType2 s_first, IteratorType2 s_last) {
+  return Impl::search_impl(label, ex, first, last, s_first, s_last);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto search(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType1, Properties1...>& view,
+            const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_impl("Kokkos::search_view_api_default", ex,
+                           KE::begin(view), KE::end(view), KE::begin(s_view),
+                           KE::end(s_view));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto search(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType1, Properties1...>& view,
+            const ::Kokkos::View<DataType2, Properties2...>& s_view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_impl(label, ex, KE::begin(view), KE::end(view),
+                           KE::begin(s_view), KE::end(s_view));
+}
+
+// overload set 2: binary predicate passed
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first,
+                     IteratorType1 last, IteratorType2 s_first,
+                     IteratorType2 s_last, const BinaryPredicateType& pred) {
+  return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first,
+                           last, s_first, s_last, pred);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 search(const std::string& label, const ExecutionSpace& ex,
+                     IteratorType1 first, IteratorType1 last,
+                     IteratorType2 s_first, IteratorType2 s_last,
+                     const BinaryPredicateType& pred) {
+  return Impl::search_impl(label, ex, first, last, s_first, s_last, pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto search(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType1, Properties1...>& view,
+            const ::Kokkos::View<DataType2, Properties2...>& s_view,
+            const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_impl("Kokkos::search_view_api_default", ex,
+                           KE::begin(view), KE::end(view), KE::begin(s_view),
+                           KE::end(s_view), pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicateType>
+auto search(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType1, Properties1...>& view,
+            const ::Kokkos::View<DataType2, Properties2...>& s_view,
+            const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_impl(label, ex, KE::begin(view), KE::end(view),
+                           KE::begin(s_view), KE::end(s_view), pred);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp
new file mode 100644
index 000000000..854d911e7
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SearchN.hpp
@@ -0,0 +1,144 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SEARCH_N_HPP
+#define KOKKOS_STD_ALGORITHMS_SEARCH_N_HPP
+
+#include "impl/Kokkos_SearchN.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set 1: no binary predicate passed
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class ValueType>
+IteratorType search_n(const ExecutionSpace& ex, IteratorType first,
+                      IteratorType last, SizeType count,
+                      const ValueType& value) {
+  return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first,
+                             last, count, value);
+}
+
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class ValueType>
+IteratorType search_n(const std::string& label, const ExecutionSpace& ex,
+                      IteratorType first, IteratorType last, SizeType count,
+                      const ValueType& value) {
+  return Impl::search_n_impl(label, ex, first, last, count, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class ValueType>
+auto search_n(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& view,
+              SizeType count, const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex,
+                             KE::begin(view), KE::end(view), count, value);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class ValueType>
+auto search_n(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& view,
+              SizeType count, const ValueType& value) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count,
+                             value);
+}
+
+// overload set 2: binary predicate passed
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class ValueType, class BinaryPredicateType>
+IteratorType search_n(const ExecutionSpace& ex, IteratorType first,
+                      IteratorType last, SizeType count, const ValueType& value,
+                      const BinaryPredicateType& pred) {
+  return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first,
+                             last, count, value, pred);
+}
+
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class ValueType, class BinaryPredicateType>
+IteratorType search_n(const std::string& label, const ExecutionSpace& ex,
+                      IteratorType first, IteratorType last, SizeType count,
+                      const ValueType& value, const BinaryPredicateType& pred) {
+  return Impl::search_n_impl(label, ex, first, last, count, value, pred);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class ValueType, class BinaryPredicateType>
+auto search_n(const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& view,
+              SizeType count, const ValueType& value,
+              const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex,
+                             KE::begin(view), KE::end(view), count, value,
+                             pred);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class SizeType, class ValueType, class BinaryPredicateType>
+auto search_n(const std::string& label, const ExecutionSpace& ex,
+              const ::Kokkos::View<DataType, Properties...>& view,
+              SizeType count, const ValueType& value,
+              const BinaryPredicateType& pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count,
+                             value, pred);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp
new file mode 100644
index 000000000..cee111af9
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SHIFT_LEFT_HPP
+#define KOKKOS_STD_ALGORITHMS_SHIFT_LEFT_HPP
+
+#include "impl/Kokkos_ShiftLeft.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType shift_left(const ExecutionSpace& ex, IteratorType first,
+                        IteratorType last,
+                        typename IteratorType::difference_type n) {
+  return Impl::shift_left_impl("Kokkos::shift_left_iterator_api_default", ex,
+                               first, last, n);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType shift_left(const std::string& label, const ExecutionSpace& ex,
+                        IteratorType first, IteratorType last,
+                        typename IteratorType::difference_type n) {
+  return Impl::shift_left_impl(label, ex, first, last, n);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto shift_left(const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType, Properties...>& view,
+                typename decltype(begin(view))::difference_type n) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::shift_left_impl("Kokkos::shift_left_view_api_default", ex,
+                               begin(view), end(view), n);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto shift_left(const std::string& label, const ExecutionSpace& ex,
+                const ::Kokkos::View<DataType, Properties...>& view,
+                typename decltype(begin(view))::difference_type n) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::shift_left_impl(label, ex, begin(view), end(view), n);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp
new file mode 100644
index 000000000..f104d2bd7
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SHIFT_RIGHT_HPP
+#define KOKKOS_STD_ALGORITHMS_SHIFT_RIGHT_HPP
+
+#include "impl/Kokkos_ShiftRight.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType shift_right(const ExecutionSpace& ex, IteratorType first,
+                         IteratorType last,
+                         typename IteratorType::difference_type n) {
+  return Impl::shift_right_impl("Kokkos::shift_right_iterator_api_default", ex,
+                                first, last, n);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType shift_right(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, IteratorType last,
+                         typename IteratorType::difference_type n) {
+  return Impl::shift_right_impl(label, ex, first, last, n);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto shift_right(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& view,
+                 typename decltype(begin(view))::difference_type n) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::shift_right_impl("Kokkos::shift_right_view_api_default", ex,
+                                begin(view), end(view), n);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto shift_right(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType, Properties...>& view,
+                 typename decltype(begin(view))::difference_type n) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::shift_right_impl(label, ex, begin(view), end(view), n);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SortingOperations.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SortingOperations.hpp
deleted file mode 100644
index bcc38fb38..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SortingOperations.hpp
+++ /dev/null
@@ -1,378 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_STD_SORTING_OPERATIONS_HPP
-#define KOKKOS_STD_SORTING_OPERATIONS_HPP
-
-#include <Kokkos_Core.hpp>
-#include "Kokkos_BeginEnd.hpp"
-#include "Kokkos_Constraints.hpp"
-#include "Kokkos_NonModifyingSequenceOperations.hpp"
-#include "Kokkos_HelperPredicates.hpp"
-#include <string>
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-// ------------------
-//
-// functors
-//
-// ------------------
-
-template <class IteratorType, class IndicatorViewType, class ComparatorType>
-struct StdIsSortedUntilFunctor {
-  using index_type = typename IteratorType::difference_type;
-  IteratorType m_first;
-  IndicatorViewType m_indicator;
-  ComparatorType m_comparator;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, int& update, const bool final) const {
-    const auto& val_i   = m_first[i];
-    const auto& val_ip1 = m_first[i + 1];
-
-    if (m_comparator(val_ip1, val_i)) {
-      ++update;
-    }
-
-    if (final) {
-      m_indicator(i) = update;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdIsSortedUntilFunctor(IteratorType _first1, IndicatorViewType indicator,
-                          ComparatorType comparator)
-      : m_first(std::move(_first1)),
-        m_indicator(std::move(indicator)),
-        m_comparator(std::move(comparator)) {}
-};
-
-template <class IteratorType, class ComparatorType>
-struct StdIsSortedFunctor {
-  using index_type = typename IteratorType::difference_type;
-  IteratorType m_first;
-  ComparatorType m_comparator;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, std::size_t& update) const {
-    const auto& val_i   = m_first[i];
-    const auto& val_ip1 = m_first[i + 1];
-
-    if (m_comparator(val_ip1, val_i)) {
-      ++update;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdIsSortedFunctor(IteratorType _first1, ComparatorType comparator)
-      : m_first(std::move(_first1)), m_comparator(std::move(comparator)) {}
-};
-
-// ------------------------------------------
-// is_sorted_until_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-IteratorType is_sorted_until_impl(const std::string& label,
-                                  const ExecutionSpace& ex, IteratorType first,
-                                  IteratorType last, ComparatorType comp) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-
-  // trivial case
-  if (num_elements <= 1) {
-    return last;
-  }
-
-  /*
-    use scan and a helper "indicator" view
-    such that we scan the data and fill the indicator with
-    partial sum that is always 0 unless we find a pair that
-    breaks the sorting, so in that case the indicator will
-    have a 1 starting at the location where the sorting breaks.
-    So finding that 1 means finding the location we want.
-   */
-
-  // aliases
-  using indicator_value_type = std::size_t;
-  using indicator_view_type =
-      ::Kokkos::View<indicator_value_type*, ExecutionSpace>;
-  using functor_type =
-      StdIsSortedUntilFunctor<IteratorType, indicator_view_type,
-                              ComparatorType>;
-
-  // do scan
-  // use num_elements-1 because each index handles i and i+1
-  const auto num_elements_minus_one = num_elements - 1;
-  indicator_view_type indicator("is_sorted_until_indicator_helper",
-                                num_elements_minus_one);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_minus_one),
-      functor_type(first, indicator, std::move(comp)));
-
-  // try to find the first sentinel value, which indicates
-  // where the sorting condition breaks
-  namespace KE                                  = ::Kokkos::Experimental;
-  constexpr indicator_value_type sentinel_value = 1;
-  auto r =
-      KE::find(ex, KE::cbegin(indicator), KE::cend(indicator), sentinel_value);
-  const auto shift = r - ::Kokkos::Experimental::cbegin(indicator);
-
-  return first + (shift + 1);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType is_sorted_until_impl(const std::string& label,
-                                  const ExecutionSpace& ex, IteratorType first,
-                                  IteratorType last) {
-  using value_type = typename IteratorType::value_type;
-  using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
-  return is_sorted_until_impl(label, ex, first, last, pred_t());
-}
-
-// ------------------------------------------
-// is_sorted_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last,
-                    ComparatorType comp) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  if (num_elements <= 1) {
-    return true;
-  }
-
-  // use num_elements-1 because each index handles i and i+1
-  const auto num_elements_minus_one = num_elements - 1;
-  using functor_type = StdIsSortedFunctor<IteratorType, ComparatorType>;
-
-  // result is incremented by one if sorting breaks at index i
-  std::size_t result = 0;
-  ::Kokkos::parallel_reduce(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_minus_one),
-      functor_type(first, std::move(comp)), result);
-
-  return result == 0;
-}
-
-template <class ExecutionSpace, class IteratorType>
-bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last) {
-  using value_type = typename IteratorType::value_type;
-  using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
-  return is_sorted_impl(label, ex, first, last, pred_t());
-}
-
-}  // namespace Impl
-
-// ----------------------------------
-// is_sorted_until public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType>
-IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first,
-                             IteratorType last) {
-  return Impl::is_sorted_until_impl(
-      "Kokkos::is_sorted_until_iterator_api_default", ex, first, last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, IteratorType last) {
-  return Impl::is_sorted_until_impl(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto is_sorted_until(const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default",
-                                    ex, KE::begin(view), KE::end(view));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto is_sorted_until(const std::string& label, const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view));
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first,
-                             IteratorType last, ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::is_sorted_until_impl(
-      "Kokkos::is_sorted_until_iterator_api_default", ex, first, last,
-      std::move(comp));
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, IteratorType last,
-                             ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::is_sorted_until_impl(label, ex, first, last, std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
-auto is_sorted_until(const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType, Properties...>& view,
-                     ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default",
-                                    ex, KE::begin(view), KE::end(view),
-                                    std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
-auto is_sorted_until(const std::string& label, const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType, Properties...>& view,
-                     ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view),
-                                    std::move(comp));
-}
-
-// ----------------------------------
-// is_sorted public API
-// ----------------------------------
-template <class ExecutionSpace, class IteratorType>
-bool is_sorted(const ExecutionSpace& ex, IteratorType first,
-               IteratorType last) {
-  return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex,
-                              first, last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-bool is_sorted(const std::string& label, const ExecutionSpace& ex,
-               IteratorType first, IteratorType last) {
-  return Impl::is_sorted_impl(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-bool is_sorted(const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex,
-                              KE::cbegin(view), KE::cend(view));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-bool is_sorted(const std::string& label, const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view));
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-bool is_sorted(const ExecutionSpace& ex, IteratorType first, IteratorType last,
-               ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex,
-                              first, last, std::move(comp));
-}
-
-template <class ExecutionSpace, class IteratorType, class ComparatorType>
-bool is_sorted(const std::string& label, const ExecutionSpace& ex,
-               IteratorType first, IteratorType last, ComparatorType comp) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::is_sorted_impl(label, ex, first, last, std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
-bool is_sorted(const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType, Properties...>& view,
-               ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex,
-                              KE::cbegin(view), KE::cend(view),
-                              std::move(comp));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ComparatorType>
-bool is_sorted(const std::string& label, const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType, Properties...>& view,
-               ComparatorType comp) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view),
-                              std::move(comp));
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp
new file mode 100644
index 000000000..9006aa916
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp
@@ -0,0 +1,69 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SWAP_HPP
+#define KOKKOS_STD_ALGORITHMS_SWAP_HPP
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+// swap
+template <class T>
+KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept {
+  static_assert(
+      std::is_move_assignable<T>::value && std::is_move_constructible<T>::value,
+      "Kokkos::Experimental::swap arguments must be move assignable "
+      "and move constructible");
+
+  T tmp = std::move(a);
+  a     = std::move(b);
+  b     = std::move(tmp);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
new file mode 100644
index 000000000..2997cdab4
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp
@@ -0,0 +1,97 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SWAP_RANGES_HPP
+#define KOKKOS_STD_ALGORITHMS_SWAP_RANGES_HPP
+
+#include "impl/Kokkos_SwapRanges.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 swap_ranges(const ExecutionSpace& ex, IteratorType1 first1,
+                          IteratorType1 last1, IteratorType2 first2) {
+  return Impl::swap_ranges_impl("Kokkos::swap_ranges_iterator_api_default", ex,
+                                first1, last1, first2);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto swap_ranges(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  assert(source.extent(0) == dest.extent(0));
+  return Impl::swap_ranges_impl("Kokkos::swap_ranges_view_api_default", ex,
+                                begin(source), end(source), begin(dest));
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 swap_ranges(const std::string& label, const ExecutionSpace& ex,
+                          IteratorType1 first1, IteratorType1 last1,
+                          IteratorType2 first2) {
+  return Impl::swap_ranges_impl(label, ex, first1, last1, first2);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto swap_ranges(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  assert(source.extent(0) == dest.extent(0));
+  return Impl::swap_ranges_impl(label, ex, begin(source), end(source),
+                                begin(dest));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
new file mode 100644
index 000000000..6dfb83a8c
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp
@@ -0,0 +1,166 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_TRANSFORM_HPP
+#define KOKKOS_STD_ALGORITHMS_TRANSFORM_HPP
+
+#include "impl/Kokkos_Transform.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class UnaryOperation>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      InputIterator, OutputIterator>::value,
+                  OutputIterator>
+transform(const ExecutionSpace& ex, InputIterator first1, InputIterator last1,
+          OutputIterator d_first, UnaryOperation unary_op) {
+  return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex,
+                              first1, last1, d_first, std::move(unary_op));
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class UnaryOperation>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      InputIterator, OutputIterator>::value,
+                  OutputIterator>
+transform(const std::string& label, const ExecutionSpace& ex,
+          InputIterator first1, InputIterator last1, OutputIterator d_first,
+          UnaryOperation unary_op) {
+  return Impl::transform_impl(label, ex, first1, last1, d_first,
+                              std::move(unary_op));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class UnaryOperation>
+auto transform(const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType1, Properties1...>& source,
+               ::Kokkos::View<DataType2, Properties2...>& dest,
+               UnaryOperation unary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::transform_impl("Kokkos::transform_view_api_default", ex,
+                              begin(source), end(source), begin(dest),
+                              std::move(unary_op));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class UnaryOperation>
+auto transform(const std::string& label, const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType1, Properties1...>& source,
+               ::Kokkos::View<DataType2, Properties2...>& dest,
+               UnaryOperation unary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::transform_impl(label, ex, begin(source), end(source),
+                              begin(dest), std::move(unary_op));
+}
+
+template <class ExecutionSpace, class InputIterator1, class InputIterator2,
+          class OutputIterator, class BinaryOperation>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      InputIterator1, InputIterator2, OutputIterator>::value,
+                  OutputIterator>
+transform(const ExecutionSpace& ex, InputIterator1 first1, InputIterator1 last1,
+          InputIterator2 first2, OutputIterator d_first,
+          BinaryOperation binary_op) {
+  return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex,
+                              first1, last1, first2, d_first,
+                              std::move(binary_op));
+}
+
+template <class ExecutionSpace, class InputIterator1, class InputIterator2,
+          class OutputIterator, class BinaryOperation>
+std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
+                      InputIterator1, InputIterator2, OutputIterator>::value,
+                  OutputIterator>
+transform(const std::string& label, const ExecutionSpace& ex,
+          InputIterator1 first1, InputIterator1 last1, InputIterator2 first2,
+          OutputIterator d_first, BinaryOperation binary_op) {
+  return Impl::transform_impl(label, ex, first1, last1, first2, d_first,
+                              std::move(binary_op));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class DataType3,
+          class... Properties3, class BinaryOperation>
+auto transform(const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType1, Properties1...>& source1,
+               const ::Kokkos::View<DataType2, Properties2...>& source2,
+               ::Kokkos::View<DataType3, Properties3...>& dest,
+               BinaryOperation binary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::transform_impl("Kokkos::transform_view_api_default", ex,
+                              begin(source1), end(source1), begin(source2),
+                              begin(dest), std::move(binary_op));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class DataType3,
+          class... Properties3, class BinaryOperation>
+auto transform(const std::string& label, const ExecutionSpace& ex,
+               const ::Kokkos::View<DataType1, Properties1...>& source1,
+               const ::Kokkos::View<DataType2, Properties2...>& source2,
+               ::Kokkos::View<DataType3, Properties3...>& dest,
+               BinaryOperation binary_op) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::transform_impl(label, ex, begin(source1), end(source1),
+                              begin(source2), begin(dest),
+                              std::move(binary_op));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp
new file mode 100644
index 000000000..d0073599b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp
@@ -0,0 +1,131 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_TRASFORM_EXCLUSIVE_SCAN_HPP
+#define KOKKOS_STD_ALGORITHMS_TRASFORM_EXCLUSIVE_SCAN_HPP
+
+#include "impl/Kokkos_TransformExclusiveScan.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType,
+          class UnaryOpType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+transform_exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+                         InputIteratorType last, OutputIteratorType first_dest,
+                         ValueType init_value, BinaryOpType binary_op,
+                         UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  return Impl::transform_exclusive_scan_impl(
+      "Kokkos::transform_exclusive_scan_custom_functors_iterator_api", ex,
+      first, last, first_dest, init_value, binary_op, unary_op);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType,
+          class UnaryOpType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+transform_exclusive_scan(const std::string& label, const ExecutionSpace& ex,
+                         InputIteratorType first, InputIteratorType last,
+                         OutputIteratorType first_dest, ValueType init_value,
+                         BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  return Impl::transform_exclusive_scan_impl(label, ex, first, last, first_dest,
+                                             init_value, binary_op, unary_op);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType,
+          class BinaryOpType, class UnaryOpType>
+auto transform_exclusive_scan(
+    const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_exclusive_scan_impl(
+      "Kokkos::transform_exclusive_scan_custom_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
+      init_value, binary_op, unary_op);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class ValueType,
+          class BinaryOpType, class UnaryOpType>
+auto transform_exclusive_scan(
+    const std::string& label, const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  static_assert(std::is_move_constructible<ValueType>::value,
+                "ValueType must be move constructible.");
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_exclusive_scan_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), init_value, binary_op, unary_op);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp
new file mode 100644
index 000000000..088e162ad
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp
@@ -0,0 +1,190 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_TRANSFORM_INCLUSIVE_SCAN_HPP
+#define KOKKOS_STD_ALGORITHMS_TRANSFORM_INCLUSIVE_SCAN_HPP
+
+#include "impl/Kokkos_TransformInclusiveScan.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set 1 (no init value)
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+                         InputIteratorType last, OutputIteratorType first_dest,
+                         BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::transform_inclusive_scan_impl(
+      "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex,
+      first, last, first_dest, binary_op, unary_op);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex,
+                         InputIteratorType first, InputIteratorType last,
+                         OutputIteratorType first_dest, BinaryOpType binary_op,
+                         UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+
+  return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest,
+                                             binary_op, unary_op);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOpType,
+          class UnaryOpType>
+auto transform_inclusive_scan(
+    const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_inclusive_scan_impl(
+      "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
+      binary_op, unary_op);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOpType,
+          class UnaryOpType>
+auto transform_inclusive_scan(
+    const std::string& label, const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_inclusive_scan_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op, unary_op);
+}
+
+// overload set 2 (init value)
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
+          class ValueType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
+                         InputIteratorType last, OutputIteratorType first_dest,
+                         BinaryOpType binary_op, UnaryOpType unary_op,
+                         ValueType init_value) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  return Impl::transform_inclusive_scan_impl(
+      "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex,
+      first, last, first_dest, binary_op, unary_op, init_value);
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
+          class ValueType>
+std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators<
+                     InputIteratorType, OutputIteratorType>::value,
+                 OutputIteratorType>
+transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex,
+                         InputIteratorType first, InputIteratorType last,
+                         OutputIteratorType first_dest, BinaryOpType binary_op,
+                         UnaryOpType unary_op, ValueType init_value) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest,
+                                             binary_op, unary_op, init_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOpType,
+          class UnaryOpType, class ValueType>
+auto transform_inclusive_scan(
+    const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_inclusive_scan_impl(
+      "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex,
+      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
+      binary_op, unary_op, init_value);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryOpType,
+          class UnaryOpType, class ValueType>
+auto transform_inclusive_scan(
+    const std::string& label, const ExecutionSpace& ex,
+    const ::Kokkos::View<DataType1, Properties1...>& view_from,
+    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
+    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
+  namespace KE = ::Kokkos::Experimental;
+  return Impl::transform_inclusive_scan_impl(
+      label, ex, KE::cbegin(view_from), KE::cend(view_from),
+      KE::begin(view_dest), binary_op, unary_op, init_value);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_TransformReduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp
similarity index 62%
rename from packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_TransformReduce.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp
index 846166d32..5caced591 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_TransformReduce.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp
@@ -42,219 +42,14 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_NUMERICS_TRANSFORM_REDUCE_HPP
-#define KOKKOS_STD_NUMERICS_TRANSFORM_REDUCE_HPP
+#ifndef KOKKOS_STD_ALGORITHMS_TRANSFORM_REDUCE_HPP
+#define KOKKOS_STD_ALGORITHMS_TRANSFORM_REDUCE_HPP
 
-#include <Kokkos_Core.hpp>
-#include "../Kokkos_Constraints.hpp"
-#include "../Kokkos_Distance.hpp"
-#include "../Kokkos_ModifyingOperations.hpp"
-#include "../Kokkos_BeginEnd.hpp"
-#include "../Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp"
+#include "impl/Kokkos_TransformReduce.hpp"
+#include "Kokkos_BeginEnd.hpp"
 
 namespace Kokkos {
 namespace Experimental {
-namespace Impl {
-
-//
-// helper functors
-//
-template <class ValueType>
-struct StdTranformReduceDefaultBinaryTransformFunctor {
-  KOKKOS_FUNCTION
-  constexpr ValueType operator()(const ValueType& a, const ValueType& b) const {
-    return (a * b);
-  }
-};
-
-template <class ValueType>
-struct StdTranformReduceDefaultJoinFunctor {
-  KOKKOS_FUNCTION
-  constexpr ValueType operator()(const ValueType& a, const ValueType& b) const {
-    return a + b;
-  }
-
-  KOKKOS_FUNCTION
-  constexpr ValueType operator()(const volatile ValueType& a,
-                                 const volatile ValueType& b) const {
-    return a + b;
-  }
-};
-
-template <class IteratorType, class ReducerType, class TransformType>
-struct StdTransformReduceSingleIntervalFunctor {
-  using red_value_type = typename ReducerType::value_type;
-  using index_type     = typename IteratorType::difference_type;
-
-  const IteratorType m_first;
-  const ReducerType m_reducer;
-  const TransformType m_transform;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, red_value_type& red_value) const {
-    auto tmp_wrapped_value = red_value_type{m_transform(m_first[i]), false};
-    if (red_value.is_initial) {
-      red_value = tmp_wrapped_value;
-    } else {
-      m_reducer.join(red_value, tmp_wrapped_value);
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdTransformReduceSingleIntervalFunctor(IteratorType first,
-                                          ReducerType reducer,
-                                          TransformType transform)
-      : m_first(std::move(first)),
-        m_reducer(std::move(reducer)),
-        m_transform(std::move(transform)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2,
-          class ReducerType, class TransformType>
-struct StdTransformReduceTwoIntervalsFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  const IteratorType1 m_first1;
-  const IteratorType2 m_first2;
-  const ReducerType m_reducer;
-  const TransformType m_transform;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    auto tmp_wrapped_value =
-        red_value_type{m_transform(m_first1[i], m_first2[i]), false};
-
-    if (red_value.is_initial) {
-      red_value = tmp_wrapped_value;
-    } else {
-      m_reducer.join(red_value, tmp_wrapped_value);
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdTransformReduceTwoIntervalsFunctor(IteratorType1 first1,
-                                        IteratorType2 first2,
-                                        ReducerType reducer,
-                                        TransformType transform)
-      : m_first1(std::move(first1)),
-        m_first2(std::move(first2)),
-        m_reducer(std::move(reducer)),
-        m_transform(std::move(transform)) {}
-};
-
-//------------------------------
-//
-// impl functions
-//
-//------------------------------
-
-template <class ExecutionSpace, class IteratorType, class ValueType,
-          class JoinerType, class UnaryTransformerType>
-ValueType transform_reduce_custom_functors_impl(
-    const std::string& label, const ExecutionSpace& ex, IteratorType first,
-    IteratorType last, ValueType init_reduction_value, JoinerType joiner,
-    UnaryTransformerType transformer) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    // init is returned, unmodified
-    return init_reduction_value;
-  }
-
-  // aliases
-  using reducer_type =
-      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
-  using functor_type =
-      StdTransformReduceSingleIntervalFunctor<IteratorType, reducer_type,
-                                              UnaryTransformerType>;
-  using reduction_value_type = typename reducer_type::value_type;
-
-  // run
-  reduction_value_type result;
-  reducer_type reducer(result, joiner);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            functor_type(first, reducer, transformer), reducer);
-
-  // fence not needed since reducing into scalar
-
-  // as per standard, transform is not applied to the init value
-  // https://en.cppreference.com/w/cpp/algorithm/transform_reduce
-  return joiner(result.val, init_reduction_value);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ValueType, class JoinerType, class BinaryTransformerType>
-ValueType transform_reduce_custom_functors_impl(
-    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-    IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value,
-    JoinerType joiner, BinaryTransformerType transformer) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
-  Impl::expect_valid_range(first1, last1);
-
-  if (first1 == last1) {
-    // init is returned, unmodified
-    return init_reduction_value;
-  }
-
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using reducer_type =
-      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
-  using functor_type =
-      StdTransformReduceTwoIntervalsFunctor<index_type, IteratorType1,
-                                            IteratorType2, reducer_type,
-                                            BinaryTransformerType>;
-  using reduction_value_type = typename reducer_type::value_type;
-
-  // run
-  reduction_value_type result;
-  reducer_type reducer(result, joiner);
-
-  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
-  ::Kokkos::parallel_reduce(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      functor_type(first1, first2, reducer, transformer), reducer);
-
-  // fence not needed since reducing into scalar
-  return joiner(result.val, init_reduction_value);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2,
-          class ValueType>
-ValueType transform_reduce_default_functors_impl(
-    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
-    IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
-  Impl::expect_valid_range(first1, last1);
-
-  // aliases
-  using transformer_type =
-      Impl::StdTranformReduceDefaultBinaryTransformFunctor<ValueType>;
-  using joiner_type = Impl::StdTranformReduceDefaultJoinFunctor<ValueType>;
-
-  return transform_reduce_custom_functors_impl(
-      label, ex, first1, last1, first2, std::move(init_reduction_value),
-      joiner_type(), transformer_type());
-}
-
-}  // end namespace Impl
-
-///////////////////////////////
-//
-// transform_reduce public API
-//
-///////////////////////////////
 
 // ----------------------------
 // overload set1:
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp
new file mode 100644
index 000000000..aeb54a6ff
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Unique.hpp
@@ -0,0 +1,124 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_UNIQUE_HPP
+#define KOKKOS_STD_ALGORITHMS_UNIQUE_HPP
+
+#include "impl/Kokkos_Unique.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// note: the enable_if below is to avoid "call to ... is ambiguous"
+// for example in the unit test when using a variadic function
+
+// overload set1
+template <class ExecutionSpace, class IteratorType>
+std::enable_if_t<!::Kokkos::is_view<IteratorType>::value, IteratorType> unique(
+    const ExecutionSpace& ex, IteratorType first, IteratorType last) {
+  return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first,
+                           last);
+}
+
+template <class ExecutionSpace, class IteratorType>
+std::enable_if_t<!::Kokkos::is_view<IteratorType>::value, IteratorType> unique(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last) {
+  return Impl::unique_impl(label, ex, first, last);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto unique(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return ::Kokkos::Experimental::unique("Kokkos::unique_view_api_default", ex,
+                                        begin(view), end(view));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties>
+auto unique(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return ::Kokkos::Experimental::unique(label, ex, begin(view), end(view));
+}
+
+// overload set2
+template <class ExecutionSpace, class IteratorType, class BinaryPredicate>
+IteratorType unique(const ExecutionSpace& ex, IteratorType first,
+                    IteratorType last, BinaryPredicate pred) {
+  return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first,
+                           last, pred);
+}
+
+template <class ExecutionSpace, class IteratorType, class BinaryPredicate>
+IteratorType unique(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType last,
+                    BinaryPredicate pred) {
+  return Impl::unique_impl(label, ex, first, last, pred);
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class BinaryPredicate>
+auto unique(const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view,
+            BinaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::unique_impl("Kokkos::unique_view_api_default", ex, begin(view),
+                           end(view), std::move(pred));
+}
+
+template <class ExecutionSpace, class DataType, class... Properties,
+          class BinaryPredicate>
+auto unique(const std::string& label, const ExecutionSpace& ex,
+            const ::Kokkos::View<DataType, Properties...>& view,
+            BinaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
+  return Impl::unique_impl(label, ex, begin(view), end(view), std::move(pred));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp
new file mode 100644
index 000000000..632b560fa
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp
@@ -0,0 +1,143 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_UNIQUE_COPY_HPP
+#define KOKKOS_STD_ALGORITHMS_UNIQUE_COPY_HPP
+
+#include "impl/Kokkos_UniqueCopy.hpp"
+#include "Kokkos_BeginEnd.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload set1
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+std::enable_if_t<!::Kokkos::is_view<InputIterator>::value, OutputIterator>
+unique_copy(const ExecutionSpace& ex, InputIterator first, InputIterator last,
+            OutputIterator d_first) {
+  return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex,
+                                first, last, d_first);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+std::enable_if_t<!::Kokkos::is_view<InputIterator>::value, OutputIterator>
+unique_copy(const std::string& label, const ExecutionSpace& ex,
+            InputIterator first, InputIterator last, OutputIterator d_first) {
+  return Impl::unique_copy_impl(label, ex, first, last, d_first);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto unique_copy(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return ::Kokkos::Experimental::unique_copy(
+      "Kokkos::unique_copy_view_api_default", ex, cbegin(source), cend(source),
+      begin(dest));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2>
+auto unique_copy(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return ::Kokkos::Experimental::unique_copy(label, ex, cbegin(source),
+                                             cend(source), begin(dest));
+}
+
+// overload set2
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class BinaryPredicate>
+OutputIterator unique_copy(const ExecutionSpace& ex, InputIterator first,
+                           InputIterator last, OutputIterator d_first,
+                           BinaryPredicate pred) {
+  return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex,
+                                first, last, d_first, pred);
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class BinaryPredicate>
+OutputIterator unique_copy(const std::string& label, const ExecutionSpace& ex,
+                           InputIterator first, InputIterator last,
+                           OutputIterator d_first, BinaryPredicate pred) {
+  return Impl::unique_copy_impl(label, ex, first, last, d_first, pred);
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicate>
+auto unique_copy(const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 const ::Kokkos::View<DataType2, Properties2...>& dest,
+                 BinaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::unique_copy_impl("Kokkos::unique_copy_view_api_default", ex,
+                                cbegin(source), cend(source), begin(dest),
+                                std::move(pred));
+}
+
+template <class ExecutionSpace, class DataType1, class... Properties1,
+          class DataType2, class... Properties2, class BinaryPredicate>
+auto unique_copy(const std::string& label, const ExecutionSpace& ex,
+                 const ::Kokkos::View<DataType1, Properties1...>& source,
+                 const ::Kokkos::View<DataType2, Properties2...>& dest,
+                 BinaryPredicate pred) {
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
+  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
+
+  return Impl::unique_copy_impl(label, ex, cbegin(source), cend(source),
+                                begin(dest), std::move(pred));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
new file mode 100644
index 000000000..35c78b86b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp
@@ -0,0 +1,135 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ADJACENT_DIFFERENCE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_ADJACENT_DIFFERENCE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ValueType1, class ValueType2, class RetType = ValueType2>
+struct StdAdjacentDifferenceDefaultBinaryOpFunctor {
+  KOKKOS_FUNCTION
+  constexpr RetType operator()(const ValueType1& a, const ValueType2& b) const {
+    return a - b;
+  }
+};
+
+template <class InputIteratorType, class OutputIteratorType,
+          class BinaryOperator>
+struct StdAdjacentDiffFunctor {
+  using index_type = typename InputIteratorType::difference_type;
+
+  const InputIteratorType m_first_from;
+  const OutputIteratorType m_first_dest;
+  BinaryOperator m_op;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i) const {
+    const auto& my_value = m_first_from[i];
+    if (i == 0) {
+      m_first_dest[i] = my_value;
+    } else {
+      const auto& left_value = m_first_from[i - 1];
+      m_first_dest[i]        = m_op(my_value, left_value);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdAdjacentDiffFunctor(InputIteratorType first_from,
+                         OutputIteratorType first_dest, BinaryOperator op)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_op(std::move(op)) {}
+};
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOp>
+OutputIteratorType adjacent_difference_impl(const std::string& label,
+                                            const ExecutionSpace& ex,
+                                            InputIteratorType first_from,
+                                            InputIteratorType last_from,
+                                            OutputIteratorType first_dest,
+                                            BinaryOp bin_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  if (first_from == last_from) {
+    return first_dest;
+  }
+
+  // aliases
+  using value_type    = typename OutputIteratorType::value_type;
+  using aux_view_type = ::Kokkos::View<value_type*, ExecutionSpace>;
+  using functor_t =
+      StdAdjacentDiffFunctor<InputIteratorType, OutputIteratorType, BinaryOp>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  aux_view_type aux_view("aux_view", num_elements);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         functor_t(first_from, first_dest, bin_op));
+  ex.fence("Kokkos::adjacent_difference: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp
new file mode 100644
index 000000000..155f6c7bb
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp
@@ -0,0 +1,140 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ADJACENT_FIND_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_ADJACENT_FIND_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType, class ReducerType,
+          class PredicateType>
+struct StdAdjacentFindFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType m_first;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    const auto& my_value   = m_first[i];
+    const auto& next_value = m_first[i + 1];
+    const bool are_equal   = m_p(my_value, next_value);
+
+    auto rv =
+        are_equal
+            ? red_value_type{i}
+            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdAdjacentFindFunctor(IteratorType first, ReducerType reducer,
+                         PredicateType p)
+      : m_first(std::move(first)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+IteratorType adjacent_find_impl(const std::string& label,
+                                const ExecutionSpace& ex, IteratorType first,
+                                IteratorType last, PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+
+  if (num_elements <= 1) {
+    return last;
+  }
+
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t = StdAdjacentFindFunctor<index_type, IteratorType, reducer_type,
+                                        PredicateType>;
+
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+
+  // note that we use below num_elements-1 because
+  // each index i in the reduction checks i and (i+1).
+  ::Kokkos::parallel_reduce(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements - 1),
+      func_t(first, reducer, pred), reducer);
+
+  // fence not needed because reducing into scalar
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    return last;
+  } else {
+    return first + red_result.min_loc_true;
+  }
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType adjacent_find_impl(const std::string& label,
+                                const ExecutionSpace& ex, IteratorType first,
+                                IteratorType last) {
+  using value_type     = typename IteratorType::value_type;
+  using default_pred_t = StdAlgoEqualBinaryPredicate<value_type>;
+  return adjacent_find_impl(label, ex, first, last, default_pred_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp
new file mode 100644
index 000000000..dd8ae4f5b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp
@@ -0,0 +1,77 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ALL_OF_ANY_OF_NONE_OF_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_ALL_OF_ANY_OF_NONE_OF_IMPL_HPP
+
+#include "Kokkos_FindIfOrNot.hpp"
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ExecutionSpace, class InputIterator, class Predicate>
+bool all_of_impl(const std::string& label, const ExecutionSpace& ex,
+                 InputIterator first, InputIterator last, Predicate predicate) {
+  return (find_if_or_not_impl<false>(label, ex, first, last, predicate) ==
+          last);
+}
+
+template <class ExecutionSpace, class InputIterator, class Predicate>
+bool any_of_impl(const std::string& label, const ExecutionSpace& ex,
+                 InputIterator first, InputIterator last, Predicate predicate) {
+  return (find_if_or_not_impl<true>(label, ex, first, last, predicate) != last);
+}
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+bool none_of_impl(const std::string& label, const ExecutionSpace& ex,
+                  IteratorType first, IteratorType last, Predicate predicate) {
+  return (find_if_or_not_impl<true>(label, ex, first, last, predicate) == last);
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Constraints.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
similarity index 100%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_Constraints.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp
new file mode 100644
index 000000000..18f614094
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp
@@ -0,0 +1,103 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COPY_BACKWARD_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_COPY_BACKWARD_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2>
+struct StdCopyBackwardFunctor {
+  static_assert(std::is_signed<IndexType>::value,
+                "Kokkos: StdCopyBackwardFunctor requires signed index type");
+
+  IteratorType1 m_last;
+  IteratorType2 m_dest_last;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const { m_dest_last[-i - 1] = m_last[-i - 1]; }
+
+  KOKKOS_FUNCTION
+  StdCopyBackwardFunctor(IteratorType1 _last, IteratorType2 _dest_last)
+      : m_last(std::move(_last)), m_dest_last(std::move(_dest_last)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 copy_backward_impl(const std::string& label,
+                                 const ExecutionSpace& ex, IteratorType1 first,
+                                 IteratorType1 last, IteratorType2 d_last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_last);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using index_type = typename IteratorType1::difference_type;
+  using func_t =
+      StdCopyBackwardFunctor<index_type, IteratorType1, IteratorType2>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(last, d_last));
+  ex.fence("Kokkos::copy_backward: fence after operation");
+
+  // return
+  return d_last - num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp
new file mode 100644
index 000000000..03b6fc6ec
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp
@@ -0,0 +1,116 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COPY_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_COPY_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIterator, class OutputIterator>
+struct StdCopyFunctor {
+  InputIterator m_first;
+  OutputIterator m_dest_first;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const { m_dest_first[i] = m_first[i]; }
+
+  KOKKOS_FUNCTION
+  StdCopyFunctor(InputIterator _first, OutputIterator _dest_first)
+      : m_first(std::move(_first)), m_dest_first(std::move(_dest_first)) {}
+};
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator copy_impl(const std::string& label, const ExecutionSpace& ex,
+                         InputIterator first, InputIterator last,
+                         OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using index_type = typename InputIterator::difference_type;
+  using func_t     = StdCopyFunctor<index_type, InputIterator, OutputIterator>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first, d_first));
+  ex.fence("Kokkos::copy: fence after operation");
+
+  // return
+  return d_first + num_elements;
+}
+
+template <class ExecutionSpace, class InputIterator, class Size,
+          class OutputIterator>
+OutputIterator copy_n_impl(const std::string& label, const ExecutionSpace& ex,
+                           InputIterator first_from, Size count,
+                           OutputIterator first_dest) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+
+  if (count > 0) {
+    return copy_impl(label, ex, first_from, first_from + count, first_dest);
+  } else {
+    return first_dest;
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp
new file mode 100644
index 000000000..aebb5a9a4
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp
@@ -0,0 +1,142 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COPY_IF_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_COPY_IF_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class FirstFrom, class FirstDest, class PredType>
+struct StdCopyIfFunctor {
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  PredType m_pred;
+
+  KOKKOS_FUNCTION
+  StdCopyIfFunctor(FirstFrom first_from, FirstDest first_dest, PredType pred)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_pred(std::move(pred)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, IndexType& update,
+                  const bool final_pass) const {
+    const auto& myval = m_first_from[i];
+    if (final_pass) {
+      if (m_pred(myval)) {
+        m_first_dest[update] = myval;
+      }
+    }
+
+    if (m_pred(myval)) {
+      update += 1;
+    }
+  }
+};
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class PredicateType>
+OutputIterator copy_if_impl(const std::string& label, const ExecutionSpace& ex,
+                            InputIterator first, InputIterator last,
+                            OutputIterator d_first, PredicateType pred) {
+  /*
+    To explain the impl, suppose that our data is:
+
+    | 1 | 1 | 2 | 2 | 3 | -2 | 4 | 4 | 4 | 5 | 7 | -10 |
+
+    and we want to copy only the even entries,
+    We can use an exclusive scan where the "update"
+    is incremented only for the elements that satisfy the predicate.
+    This way, the update allows us to track where in the destination
+    we need to copy the elements:
+
+    In this case, counting only the even entries, the exlusive scan
+    during the final pass would yield:
+
+    | 0 | 0 | 0 | 1 | 2 | 2 | 3 | 4 | 5 | 6 | 6 | 6 |
+              *   *       *   *   *   *           *
+
+    which provides the indexing in the destination where
+    each starred (*) element needs to be copied to since
+    the starred elements are those that satisfy the predicate.
+   */
+
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return d_first;
+  } else {
+    // aliases
+    using index_type = typename InputIterator::difference_type;
+    using func_type  = StdCopyIfFunctor<index_type, InputIterator,
+                                       OutputIterator, PredicateType>;
+
+    // run
+    const auto num_elements = Kokkos::Experimental::distance(first, last);
+    index_type count        = 0;
+    ::Kokkos::parallel_scan(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_type(first, d_first, pred), count);
+
+    // fence not needed because of the scan accumulating into count
+    return d_first + count;
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp
new file mode 100644
index 000000000..982ac4046
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp
@@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_COUNT_IF_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_COUNT_IF_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class Predicate>
+struct StdCountIfFunctor {
+  using index_type = typename IteratorType::difference_type;
+  IteratorType m_first;
+  Predicate m_predicate;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i, index_type& lsum) const {
+    if (m_predicate(m_first[i])) {
+      lsum++;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdCountIfFunctor(IteratorType _first, Predicate _predicate)
+      : m_first(std::move(_first)), m_predicate(std::move(_predicate)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class Predicate>
+typename IteratorType::difference_type count_if_impl(const std::string& label,
+                                                     const ExecutionSpace& ex,
+                                                     IteratorType first,
+                                                     IteratorType last,
+                                                     Predicate predicate) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using func_t = StdCountIfFunctor<IteratorType, Predicate>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  typename IteratorType::difference_type count = 0;
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_t(first, predicate), count);
+  ex.fence("Kokkos::count_if: fence after operation");
+
+  return count;
+}
+
+template <class ExecutionSpace, class IteratorType, class T>
+auto count_impl(const std::string& label, const ExecutionSpace& ex,
+                IteratorType first, IteratorType last, const T& value) {
+  return count_if_impl(
+      label, ex, first, last,
+      ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp
new file mode 100644
index 000000000..9482917ab
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp
@@ -0,0 +1,147 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_EQUAL_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_EQUAL_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+struct StdEqualFunctor {
+  IteratorType1 m_first1;
+  IteratorType2 m_first2;
+  BinaryPredicateType m_predicate;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i, std::size_t& lsum) const {
+    if (!m_predicate(m_first1[i], m_first2[i])) {
+      lsum = 1;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdEqualFunctor(IteratorType1 _first1, IteratorType2 _first2,
+                  BinaryPredicateType _predicate)
+      : m_first1(std::move(_first1)),
+        m_first2(std::move(_first2)),
+        m_predicate(std::move(_predicate)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+bool equal_impl(const std::string& label, const ExecutionSpace& ex,
+                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
+                BinaryPredicateType predicate) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  // aliases
+  using index_type = typename IteratorType1::difference_type;
+  using func_t     = StdEqualFunctor<index_type, IteratorType1, IteratorType2,
+                                 BinaryPredicateType>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  std::size_t different   = 0;
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_t(first1, first2, predicate), different);
+  ex.fence("Kokkos::equal: fence after operation");
+
+  return !different;
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+bool equal_impl(const std::string& label, const ExecutionSpace& ex,
+                IteratorType1 first1, IteratorType1 last1,
+                IteratorType2 first2) {
+  using value_type1 = typename IteratorType1::value_type;
+  using value_type2 = typename IteratorType2::value_type;
+  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return equal_impl(label, ex, first1, last1, first2, pred_t());
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+bool equal_impl(const std::string& label, const ExecutionSpace& ex,
+                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
+                IteratorType2 last2, BinaryPredicateType predicate) {
+  const auto d1 = ::Kokkos::Experimental::distance(first1, last1);
+  const auto d2 = ::Kokkos::Experimental::distance(first2, last2);
+  if (d1 != d2) {
+    return false;
+  }
+
+  return equal_impl(label, ex, first1, last1, first2, predicate);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+bool equal_impl(const std::string& label, const ExecutionSpace& ex,
+                IteratorType1 first1, IteratorType1 last1, IteratorType2 first2,
+                IteratorType2 last2) {
+  Impl::expect_valid_range(first1, last1);
+  Impl::expect_valid_range(first2, last2);
+
+  using value_type1 = typename IteratorType1::value_type;
+  using value_type2 = typename IteratorType2::value_type;
+  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return equal_impl(label, ex, first1, last1, first2, last2, pred_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp
new file mode 100644
index 000000000..0ae4651c6
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp
@@ -0,0 +1,232 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_EXCLUSIVE_SCAN_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_EXCLUSIVE_SCAN_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_ValueWrapperForNoNeutralElement.hpp"
+#include "Kokkos_IdentityReferenceUnaryFunctor.hpp"
+#include <std_algorithms/Kokkos_TransformExclusiveScan.hpp>
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest>
+struct ExclusiveScanDefaultFunctorForKnownNeutralElement {
+  using execution_space = ExeSpace;
+
+  ValueType m_init_value;
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+
+  KOKKOS_FUNCTION
+  ExclusiveScanDefaultFunctorForKnownNeutralElement(ValueType init,
+                                                    FirstFrom first_from,
+                                                    FirstDest first_dest)
+      : m_init_value(std::move(init)),
+        m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, ValueType& update,
+                  const bool final_pass) const {
+    if (final_pass) m_first_dest[i] = update + m_init_value;
+    update += m_first_from[i];
+  }
+};
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest>
+struct ExclusiveScanDefaultFunctor {
+  using execution_space = ExeSpace;
+  using value_type =
+      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
+
+  ValueType m_init_value;
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+
+  KOKKOS_FUNCTION
+  ExclusiveScanDefaultFunctor(ValueType init, FirstFrom first_from,
+                              FirstDest first_dest)
+      : m_init_value(std::move(init)),
+        m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    if (final_pass) {
+      if (i == 0) {
+        m_first_dest[i] = m_init_value;
+      } else {
+        m_first_dest[i] = update.val + m_init_value;
+      }
+    }
+
+    const auto tmp = value_type{m_first_from[i], false};
+    this->join(update, tmp);
+  }
+
+  KOKKOS_FUNCTION
+  void init(value_type& update) const {
+    update.val        = {};
+    update.is_initial = true;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    if (update.is_initial) {
+      update.val        = input.val;
+      update.is_initial = false;
+    } else {
+      update.val = update.val + input.val;
+    }
+  }
+};
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType>
+OutputIteratorType exclusive_scan_custom_op_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type    = typename InputIteratorType::difference_type;
+  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
+  using func_type =
+      TransformExclusiveScanFunctor<ExecutionSpace, index_type, ValueType,
+                                    InputIteratorType, OutputIteratorType,
+                                    BinaryOpType, unary_op_type>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_type(init_value, first_from, first_dest, bop, unary_op_type()));
+  ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+template <typename ValueType>
+using ex_scan_has_reduction_identity_sum_t =
+    decltype(Kokkos::reduction_identity<ValueType>::sum());
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+OutputIteratorType exclusive_scan_default_op_impl(const std::string& label,
+                                                  const ExecutionSpace& ex,
+                                                  InputIteratorType first_from,
+                                                  InputIteratorType last_from,
+                                                  OutputIteratorType first_dest,
+                                                  ValueType init_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // does it make sense to do this static_assert too?
+  // using input_iterator_value_type = typename InputIteratorType::value_type;
+  // static_assert
+  //   (std::is_convertible<std::remove_cv_t<input_iterator_value_type>,
+  //   ValueType>::value,
+  //    "exclusive_scan: InputIteratorType::value_type not convertible to
+  //    ValueType");
+
+  // we are unnecessarily duplicating code, but this is on purpose
+  // so that we can use the default_op for OpenMPTarget.
+  // Originally, I had this implemented as:
+  // '''
+  // using bop_type   = StdExclusiveScanDefaultJoinFunctor<ValueType>;
+  // call exclusive_scan_custom_op_impl(..., bop_type());
+  // '''
+  // which avoids duplicating the functors, but for OpenMPTarget
+  // I cannot use a custom binary op.
+  // This is the same problem that occurs for reductions.
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using func_type  = std::conditional_t<
+      ::Kokkos::is_detected<ex_scan_has_reduction_identity_sum_t,
+                            ValueType>::value,
+      ExclusiveScanDefaultFunctorForKnownNeutralElement<
+          ExecutionSpace, index_type, ValueType, InputIteratorType,
+          OutputIteratorType>,
+      ExclusiveScanDefaultFunctor<ExecutionSpace, index_type, ValueType,
+                                  InputIteratorType, OutputIteratorType>>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(label,
+                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                          func_type(init_value, first_from, first_dest));
+
+  ex.fence("Kokkos::exclusive_scan_default_op: fence after operation");
+
+  return first_dest + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp
similarity index 52%
rename from packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
rename to packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp
index bd29a0b18..843771b6b 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp
@@ -42,83 +42,65 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_THREADS)
+#ifndef KOKKOS_STD_ALGORITHMS_FILL_AND_FILL_N_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_FILL_AND_FILL_N_IMPL_HPP
 
-#include <Kokkos_Core_fwd.hpp>
-
-/* Standard C++ libraries */
-
-#include <cstdlib>
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
 #include <string>
-#include <iostream>
-#include <stdexcept>
-#include <thread>
-#include <mutex>
-
-#include <Kokkos_Threads.hpp>
-
-//----------------------------------------------------------------------------
 
 namespace Kokkos {
+namespace Experimental {
 namespace Impl {
-namespace {
-
-std::mutex host_internal_cppthread_mutex;
-
-// std::thread compatible driver.
-// Recovery from an exception would require constant intra-thread health
-// verification; which would negatively impact runtime.  As such simply
-// abort the process.
-
-void internal_cppthread_driver() {
-  try {
-    ThreadsExec::driver();
-  } catch (const std::exception& x) {
-    std::cerr << "Exception thrown from worker thread: " << x.what()
-              << std::endl;
-    std::cerr.flush();
-    std::abort();
-  } catch (...) {
-    std::cerr << "Exception thrown from worker thread" << std::endl;
-    std::cerr.flush();
-    std::abort();
-  }
-}
 
-}  // namespace
-
-//----------------------------------------------------------------------------
-// Spawn a thread
-
-void ThreadsExec::spawn() {
-  std::thread t(internal_cppthread_driver);
-  t.detach();
+template <class InputIterator, class T>
+struct StdFillFunctor {
+  using index_type = typename InputIterator::difference_type;
+  InputIterator m_first;
+  T m_value;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i) const { m_first[i] = m_value; }
+
+  KOKKOS_FUNCTION
+  StdFillFunctor(InputIterator _first, T _value)
+      : m_first(std::move(_first)), m_value(std::move(_value)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class T>
+void fill_impl(const std::string& label, const ExecutionSpace& ex,
+               IteratorType first, IteratorType last, const T& value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         StdFillFunctor<IteratorType, T>(first, value));
+  ex.fence("Kokkos::fill: fence after operation");
 }
 
-//----------------------------------------------------------------------------
-
-bool ThreadsExec::is_process() {
-  static const std::thread::id master_pid = std::this_thread::get_id();
-
-  return master_pid == std::this_thread::get_id();
-}
+template <class ExecutionSpace, class IteratorType, class SizeType, class T>
+IteratorType fill_n_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, SizeType n, const T& value) {
+  auto last = first + n;
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
 
-void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); }
-
-void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); }
-
-//----------------------------------------------------------------------------
-
-void ThreadsExec::wait_yield(volatile int& flag, const int value) {
-  while (value == flag) {
-    std::this_thread::yield();
+  if (n <= 0) {
+    return first;
   }
+
+  fill_impl(label, ex, first, last, value);
+  return last;
 }
 
 }  // namespace Impl
+}  // namespace Experimental
 }  // namespace Kokkos
 
-#else
-void KOKKOS_CORE_SRC_THREADS_EXEC_BASE_PREVENT_LINK_ERROR() {}
-#endif /* end #if defined( KOKKOS_ENABLE_THREADS ) */
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp
new file mode 100644
index 000000000..35a6c4b4a
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp
@@ -0,0 +1,191 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_END_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_END_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class ReducerType, class PredicateType>
+struct StdFindEndFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType1 m_first;
+  IteratorType1 m_last;
+  IteratorType2 m_s_first;
+  IteratorType2 m_s_last;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    namespace KE = ::Kokkos::Experimental;
+    auto myit    = m_first + i;
+    bool found   = true;
+
+    const auto search_count = KE::distance(m_s_first, m_s_last);
+    for (IndexType k = 0; k < search_count; ++k) {
+      // note that we add this EXPECT to check if we are in a valid range
+      // but I think we can remvoe this beceause the guarantee we don't go
+      // out of bounds is taken care of at the calling site
+      // where we launch the par-reduce.
+      KOKKOS_EXPECTS((myit + k) < m_last);
+
+      if (!m_p(myit[k], m_s_first[k])) {
+        found = false;
+        break;
+      }
+    }
+
+    const auto rv =
+        found ? red_value_type{i}
+              : red_value_type{::Kokkos::reduction_identity<IndexType>::max()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdFindEndFunctor(IteratorType1 first, IteratorType1 last,
+                    IteratorType2 s_first, IteratorType2 s_last,
+                    ReducerType reducer, PredicateType p)
+      : m_first(std::move(first)),
+        m_last(std::move(last)),
+        m_s_first(std::move(s_first)),
+        m_s_last(std::move(s_last)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType1 first, IteratorType1 last,
+                            IteratorType2 s_first, IteratorType2 s_last,
+                            const BinaryPredicateType& pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, s_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(s_first, s_last);
+
+  // the target sequence should not be larger than the range [first, last)
+  namespace KE            = ::Kokkos::Experimental;
+  const auto num_elements = KE::distance(first, last);
+  const auto s_count      = KE::distance(s_first, s_last);
+  KOKKOS_EXPECTS(num_elements >= s_count);
+  (void)s_count;  // needed when macro above is a no-op
+
+  if (s_first == s_last) {
+    return last;
+  }
+
+  if (first == last) {
+    return last;
+  }
+
+  // special case where the two ranges have equal size
+  if (num_elements == s_count) {
+    const auto equal_result = equal_impl(label, ex, first, last, s_first, pred);
+    return (equal_result) ? first : last;
+  } else {
+    using index_type           = typename IteratorType1::difference_type;
+    using reducer_type         = LastLoc<index_type>;
+    using reduction_value_type = typename reducer_type::value_type;
+    using func_t = StdFindEndFunctor<index_type, IteratorType1, IteratorType2,
+                                     reducer_type, BinaryPredicateType>;
+
+    // run
+    reduction_value_type red_result;
+    reducer_type reducer(red_result);
+
+    // decide the size of the range policy of the par_red:
+    // note that the last feasible index to start looking is the index
+    // whose distance from the "last" is equal to the sequence count.
+    // the +1 is because we need to include that location too.
+    const auto range_size = num_elements - s_count + 1;
+
+    // run par reduce
+    ::Kokkos::parallel_reduce(
+        label, RangePolicy<ExecutionSpace>(ex, 0, range_size),
+        func_t(first, last, s_first, s_last, reducer, pred), reducer);
+
+    // fence not needed because reducing into scalar
+
+    // decide and return
+    if (red_result.max_loc_true ==
+        ::Kokkos::reduction_identity<index_type>::max()) {
+      // if here, a subrange has not been found
+      return last;
+    } else {
+      // a location has been found
+      return first + red_result.max_loc_true;
+    }
+  }
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType1 first, IteratorType1 last,
+                            IteratorType2 s_first, IteratorType2 s_last) {
+  using value_type1    = typename IteratorType1::value_type;
+  using value_type2    = typename IteratorType2::value_type;
+  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return find_end_impl(label, ex, first, last, s_first, s_last,
+                       predicate_type());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp
new file mode 100644
index 000000000..6907bbdbc
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp
@@ -0,0 +1,161 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_FIRST_OF_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_FIRST_OF_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class ReducerType, class PredicateType>
+struct StdFindFirstOfFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType1 m_first;
+  IteratorType2 m_s_first;
+  IteratorType2 m_s_last;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    namespace KE        = ::Kokkos::Experimental;
+    const auto& myvalue = m_first[i];
+    bool found          = false;
+
+    const auto search_count = KE::distance(m_s_first, m_s_last);
+    for (IndexType k = 0; k < search_count; ++k) {
+      if (m_p(myvalue, m_s_first[k])) {
+        found = true;
+        break;
+      }
+    }
+
+    const auto rv =
+        found ? red_value_type{i}
+              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdFindFirstOfFunctor(IteratorType1 first, IteratorType2 s_first,
+                        IteratorType2 s_last, ReducerType reducer,
+                        PredicateType p)
+      : m_first(std::move(first)),
+        m_s_first(std::move(s_first)),
+        m_s_last(std::move(s_last)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 find_first_of_impl(const std::string& label,
+                                 const ExecutionSpace& ex, IteratorType1 first,
+                                 IteratorType1 last, IteratorType2 s_first,
+                                 IteratorType2 s_last,
+                                 const BinaryPredicateType& pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, s_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(s_first, s_last);
+
+  if ((s_first == s_last) || (first == last)) {
+    return last;
+  }
+
+  using index_type           = typename IteratorType1::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t = StdFindFirstOfFunctor<index_type, IteratorType1, IteratorType2,
+                                       reducer_type, BinaryPredicateType>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_t(first, s_first, s_last, reducer, pred), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // decide and return
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    // if here, nothing found
+    return last;
+  } else {
+    // a location has been found
+    return first + red_result.min_loc_true;
+  }
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 find_first_of_impl(const std::string& label,
+                                 const ExecutionSpace& ex, IteratorType1 first,
+                                 IteratorType1 last, IteratorType2 s_first,
+                                 IteratorType2 s_last) {
+  using value_type1    = typename IteratorType1::value_type;
+  using value_type2    = typename IteratorType2::value_type;
+  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return find_first_of_impl(label, ex, first, last, s_first, s_last,
+                            predicate_type());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp
new file mode 100644
index 000000000..c79c4b521
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp
@@ -0,0 +1,146 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FIND_IF_AND_FIND_IF_NOT_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_FIND_IF_AND_FIND_IF_NOT_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <bool is_find_if, class IndexType, class IteratorType,
+          class ReducerType, class PredicateType>
+struct StdFindIfOrNotFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType m_first;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    const auto& my_value = m_first[i];
+
+    // if doing find_if, look for when predicate is true
+    // if doing find_if_not, look for when predicate is false
+    const bool found_condition = is_find_if ? m_p(my_value) : !m_p(my_value);
+
+    auto rv =
+        found_condition
+            ? red_value_type{i}
+            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdFindIfOrNotFunctor(IteratorType first, ReducerType reducer,
+                        PredicateType p)
+      : m_first(std::move(first)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <bool is_find_if, class ExecutionSpace, class IteratorType,
+          class PredicateType>
+IteratorType find_if_or_not_impl(const std::string& label,
+                                 const ExecutionSpace& ex, IteratorType first,
+                                 IteratorType last, PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(
+      ex, first);  // only need one It per type
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return last;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t = StdFindIfOrNotFunctor<is_find_if, index_type, IteratorType,
+                                       reducer_type, PredicateType>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_t(first, reducer, pred), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // decide and return
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    // here, it means a valid loc has not been found,
+    return last;
+  } else {
+    // a location has been found
+    return first + red_result.min_loc_true;
+  }
+}
+
+template <class ExecutionSpace, class InputIterator, class T>
+InputIterator find_impl(const std::string& label, ExecutionSpace ex,
+                        InputIterator first, InputIterator last,
+                        const T& value) {
+  return find_if_or_not_impl<true>(
+      label, ex, first, last,
+      ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate<T>(value));
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp
new file mode 100644
index 000000000..8bd37b13b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp
@@ -0,0 +1,113 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_FOR_EACH_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_FOR_EACH_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class UnaryFunctorType>
+struct StdForEachFunctor {
+  using index_type = typename IteratorType::difference_type;
+  IteratorType m_first;
+  UnaryFunctorType m_functor;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i) const { m_functor(m_first[i]); }
+
+  KOKKOS_FUNCTION
+  StdForEachFunctor(IteratorType _first, UnaryFunctorType _functor)
+      : m_first(std::move(_first)), m_functor(std::move(_functor)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class UnaryFunctorType>
+UnaryFunctorType for_each_impl(const std::string& label,
+                               const ExecutionSpace& ex, IteratorType first,
+                               IteratorType last, UnaryFunctorType functor) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      StdForEachFunctor<IteratorType, UnaryFunctorType>(first, functor));
+  ex.fence("Kokkos::for_each: fence after operation");
+
+  return functor;
+}
+
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class UnaryFunctorType>
+IteratorType for_each_n_impl(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, SizeType n,
+                             UnaryFunctorType functor) {
+  auto last = first + n;
+  Impl::static_assert_random_access_and_accessible(ex, first, last);
+  Impl::expect_valid_range(first, last);
+
+  if (n == 0) {
+    return first;
+  }
+
+  for_each_impl(label, ex, first, last, std::move(functor));
+  // no neeed to fence since for_each_impl fences already
+
+  return last;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp
new file mode 100644
index 000000000..f01d9bfb5
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp
@@ -0,0 +1,105 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_GENERATE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_GENERATE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class Generator>
+struct StdGenerateFunctor {
+  using index_type = typename IteratorType::difference_type;
+  IteratorType m_first;
+  Generator m_generator;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i) const { m_first[i] = m_generator(); }
+
+  KOKKOS_FUNCTION
+  StdGenerateFunctor(IteratorType _first, Generator _g)
+      : m_first(std::move(_first)), m_generator(std::move(_g)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class Generator>
+void generate_impl(const std::string& label, const ExecutionSpace& ex,
+                   IteratorType first, IteratorType last, Generator g) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using func_t = StdGenerateFunctor<IteratorType, Generator>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first, g));
+  ex.fence("Kokkos::generate: fence after operation");
+}
+
+template <class ExecutionSpace, class IteratorType, class Size, class Generator>
+IteratorType generate_n_impl(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, Size count, Generator g) {
+  if (count <= 0) {
+    return first;
+  }
+
+  generate_impl(label, ex, first, first + count, g);
+  return first + count;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_HelperPredicates.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_HelperPredicates.hpp
similarity index 97%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_HelperPredicates.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_HelperPredicates.hpp
index 18d5dadd5..244bce48e 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_HelperPredicates.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_HelperPredicates.hpp
@@ -42,8 +42,8 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_HELPER_PREDICATES_HPP
-#define KOKKOS_STD_HELPER_PREDICATES_HPP
+#ifndef KOKKOS_STD_ALGORITHMS_HELPER_PREDICATES_HPP
+#define KOKKOS_STD_ALGORITHMS_HELPER_PREDICATES_HPP
 
 #include <Kokkos_Macros.hpp>
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_IdentityReferenceUnaryFunctor.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IdentityReferenceUnaryFunctor.hpp
similarity index 93%
rename from packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_IdentityReferenceUnaryFunctor.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IdentityReferenceUnaryFunctor.hpp
index d43a161fc..f41e567c9 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_IdentityReferenceUnaryFunctor.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IdentityReferenceUnaryFunctor.hpp
@@ -42,8 +42,8 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_NUMERIC_IDENTITY_REFERENCE_UNARY_FUNCTOR_HPP
-#define KOKKOS_STD_NUMERIC_IDENTITY_REFERENCE_UNARY_FUNCTOR_HPP
+#ifndef KOKKOS_STD_ALGORITHMS_NUMERIC_IDENTITY_REFERENCE_UNARY_FUNCTOR_HPP
+#define KOKKOS_STD_ALGORITHMS_NUMERIC_IDENTITY_REFERENCE_UNARY_FUNCTOR_HPP
 
 #include <Kokkos_Macros.hpp>
 
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp
new file mode 100644
index 000000000..2088ebd43
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp
@@ -0,0 +1,243 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_INCLUSIVE_SCAN_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_INCLUSIVE_SCAN_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_TransformInclusiveScan.hpp>
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <typename ValueType>
+using in_scan_has_reduction_identity_sum_t =
+    decltype(Kokkos::reduction_identity<ValueType>::sum());
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest>
+struct InclusiveScanDefaultFunctorForKnownIdentityElement {
+  using execution_space = ExeSpace;
+
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+
+  KOKKOS_FUNCTION
+  InclusiveScanDefaultFunctorForKnownIdentityElement(FirstFrom first_from,
+                                                     FirstDest first_dest)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, ValueType& update,
+                  const bool final_pass) const {
+    update += m_first_from[i];
+
+    if (final_pass) {
+      m_first_dest[i] = update;
+    }
+  }
+};
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest>
+struct InclusiveScanDefaultFunctor {
+  using execution_space = ExeSpace;
+  using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
+
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+
+  KOKKOS_FUNCTION
+  InclusiveScanDefaultFunctor(FirstFrom first_from, FirstDest first_dest)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    const auto tmp = value_type{m_first_from[i], false};
+    this->join(update, tmp);
+
+    if (final_pass) {
+      m_first_dest[i] = update.val;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void init(value_type& update) const {
+    update.val        = {};
+    update.is_initial = true;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    if (update.is_initial) {
+      update.val = input.val;
+    } else {
+      update.val = update.val + input.val;
+    }
+    update.is_initial = false;
+  }
+};
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType>
+OutputIteratorType inclusive_scan_default_op_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using value_type =
+      std::remove_const_t<typename InputIteratorType::value_type>;
+  using func_type = std::conditional_t<
+      ::Kokkos::is_detected<in_scan_has_reduction_identity_sum_t,
+                            value_type>::value,
+      InclusiveScanDefaultFunctorForKnownIdentityElement<
+          ExecutionSpace, index_type, value_type, InputIteratorType,
+          OutputIteratorType>,
+      InclusiveScanDefaultFunctor<ExecutionSpace, index_type, value_type,
+                                  InputIteratorType, OutputIteratorType>>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(label,
+                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                          func_type(first_from, first_dest));
+  ex.fence("Kokkos::inclusive_scan_default_op: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+// -------------------------------------------------------------
+// inclusive_scan_custom_binary_op_impl
+// -------------------------------------------------------------
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType>
+OutputIteratorType inclusive_scan_custom_binary_op_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, BinaryOpType binary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using value_type =
+      std::remove_const_t<typename InputIteratorType::value_type>;
+  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<value_type>;
+  using func_type     = TransformInclusiveScanNoInitValueFunctor<
+      ExecutionSpace, index_type, value_type, InputIteratorType,
+      OutputIteratorType, BinaryOpType, unary_op_type>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_type(first_from, first_dest, binary_op, unary_op_type()));
+  ex.fence("Kokkos::inclusive_scan_custom_binary_op: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+// -------------------------------------------------------------
+// inclusive_scan_custom_binary_op_impl with init_value
+// -------------------------------------------------------------
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class ValueType>
+OutputIteratorType inclusive_scan_custom_binary_op_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, BinaryOpType binary_op,
+    ValueType init_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type    = typename InputIteratorType::difference_type;
+  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
+  using func_type     = TransformInclusiveScanWithInitValueFunctor<
+      ExecutionSpace, index_type, ValueType, InputIteratorType,
+      OutputIteratorType, BinaryOpType, unary_op_type>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(label,
+                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                          func_type(first_from, first_dest, binary_op,
+                                    unary_op_type(), init_value));
+  ex.fence("Kokkos::inclusive_scan_custom_binary_op: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp
new file mode 100644
index 000000000..0f00bebb6
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp
@@ -0,0 +1,148 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_IS_PARTITIONED_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_IS_PARTITIONED_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class ReducerType, class PredicateType>
+struct StdIsPartitionedFunctor {
+  using red_value_type = typename ReducerType::value_type;
+  using index_type     = typename IteratorType::difference_type;
+
+  IteratorType m_first;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, red_value_type& redValue) const {
+    const auto predicate_value = m_p(m_first[i]);
+    constexpr index_type m_red_id_min =
+        ::Kokkos::reduction_identity<index_type>::min();
+    constexpr index_type m_red_id_max =
+        ::Kokkos::reduction_identity<index_type>::max();
+    auto rv = predicate_value ? red_value_type{i, m_red_id_min}
+                              : red_value_type{m_red_id_max, i};
+
+    m_reducer.join(redValue, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdIsPartitionedFunctor(IteratorType first, ReducerType reducer,
+                          PredicateType p)
+      : m_first(std::move(first)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, IteratorType last,
+                         PredicateType pred) {
+  // true if all elements in the range [first, last) that satisfy
+  // the predicate "pred" appear before all elements that don't.
+  // Also returns true if [first, last) is empty.
+  // also true if all elements satisfy the predicate.
+
+  // we implement it by finding:
+  // - the max location where predicate is true  (max_loc_true)
+  // - the min location where predicate is false (min_loc_false)
+  // so the range is partitioned if max_loc_true < (min_loc_false)
+
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // trivial case
+  if (first == last) {
+    return true;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = StdIsPartitioned<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t =
+      StdIsPartitionedFunctor<IteratorType, reducer_type, PredicateType>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_t(first, reducer, pred), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // decide and return
+  constexpr index_type red_id_min =
+      ::Kokkos::reduction_identity<index_type>::min();
+  constexpr index_type red_id_max =
+      ::Kokkos::reduction_identity<index_type>::max();
+
+  if (red_result.max_loc_true != red_id_max &&
+      red_result.min_loc_false != red_id_min) {
+    return red_result.max_loc_true < red_result.min_loc_false;
+  } else if (first + red_result.max_loc_true == --last) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp
new file mode 100644
index 000000000..4e36ae389
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp
@@ -0,0 +1,117 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_IS_SORTED_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_IS_SORTED_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class ComparatorType>
+struct StdIsSortedFunctor {
+  using index_type = typename IteratorType::difference_type;
+  IteratorType m_first;
+  ComparatorType m_comparator;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, std::size_t& update) const {
+    const auto& val_i   = m_first[i];
+    const auto& val_ip1 = m_first[i + 1];
+
+    if (m_comparator(val_ip1, val_i)) {
+      ++update;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdIsSortedFunctor(IteratorType _first1, ComparatorType comparator)
+      : m_first(std::move(_first1)), m_comparator(std::move(comparator)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType last,
+                    ComparatorType comp) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  if (num_elements <= 1) {
+    return true;
+  }
+
+  // use num_elements-1 because each index handles i and i+1
+  const auto num_elements_minus_one = num_elements - 1;
+  using functor_type = StdIsSortedFunctor<IteratorType, ComparatorType>;
+
+  // result is incremented by one if sorting breaks at index i
+  std::size_t result = 0;
+  ::Kokkos::parallel_reduce(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_minus_one),
+      functor_type(first, std::move(comp)), result);
+
+  return result == 0;
+}
+
+template <class ExecutionSpace, class IteratorType>
+bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex,
+                    IteratorType first, IteratorType last) {
+  using value_type = typename IteratorType::value_type;
+  using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
+  return is_sorted_impl(label, ex, first, last, pred_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp
new file mode 100644
index 000000000..4e99c301b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp
@@ -0,0 +1,153 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_IS_SORTED_UNTIL_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_IS_SORTED_UNTIL_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <std_algorithms/Kokkos_Find.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class IndicatorViewType, class ComparatorType>
+struct StdIsSortedUntilFunctor {
+  using index_type = typename IteratorType::difference_type;
+  IteratorType m_first;
+  IndicatorViewType m_indicator;
+  ComparatorType m_comparator;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, int& update, const bool final) const {
+    const auto& val_i   = m_first[i];
+    const auto& val_ip1 = m_first[i + 1];
+
+    if (m_comparator(val_ip1, val_i)) {
+      ++update;
+    }
+
+    if (final) {
+      m_indicator(i) = update;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdIsSortedUntilFunctor(IteratorType _first1, IndicatorViewType indicator,
+                          ComparatorType comparator)
+      : m_first(std::move(_first1)),
+        m_indicator(std::move(indicator)),
+        m_comparator(std::move(comparator)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class ComparatorType>
+IteratorType is_sorted_until_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  IteratorType last, ComparatorType comp) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+
+  // trivial case
+  if (num_elements <= 1) {
+    return last;
+  }
+
+  /*
+    use scan and a helper "indicator" view
+    such that we scan the data and fill the indicator with
+    partial sum that is always 0 unless we find a pair that
+    breaks the sorting, so in that case the indicator will
+    have a 1 starting at the location where the sorting breaks.
+    So finding that 1 means finding the location we want.
+   */
+
+  // aliases
+  using indicator_value_type = std::size_t;
+  using indicator_view_type =
+      ::Kokkos::View<indicator_value_type*, ExecutionSpace>;
+  using functor_type =
+      StdIsSortedUntilFunctor<IteratorType, indicator_view_type,
+                              ComparatorType>;
+
+  // do scan
+  // use num_elements-1 because each index handles i and i+1
+  const auto num_elements_minus_one = num_elements - 1;
+  indicator_view_type indicator("is_sorted_until_indicator_helper",
+                                num_elements_minus_one);
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_minus_one),
+      functor_type(first, indicator, std::move(comp)));
+
+  // try to find the first sentinel value, which indicates
+  // where the sorting condition breaks
+  namespace KE                                  = ::Kokkos::Experimental;
+  constexpr indicator_value_type sentinel_value = 1;
+  auto r =
+      KE::find(ex, KE::cbegin(indicator), KE::cend(indicator), sentinel_value);
+  const auto shift = r - ::Kokkos::Experimental::cbegin(indicator);
+
+  return first + (shift + 1);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType is_sorted_until_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  IteratorType last) {
+  using value_type = typename IteratorType::value_type;
+  using pred_t     = Impl::StdAlgoLessThanBinaryPredicate<value_type>;
+  return is_sorted_until_impl(label, ex, first, last, pred_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp
new file mode 100644
index 000000000..c3dd13e6b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp
@@ -0,0 +1,184 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_LEXICOGRAPHICAL_COMPARE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_LEXICOGRAPHICAL_COMPARE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class ComparatorType>
+struct StdCompareFunctor {
+  IteratorType1 m_it1;
+  IteratorType2 m_it2;
+  ComparatorType m_predicate;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType /* i is unused */, int& lsum) const {
+    if (m_predicate(*m_it1, *m_it2)) {
+      lsum = 1;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdCompareFunctor(IteratorType1 _it1, IteratorType2 _it2,
+                    ComparatorType _predicate)
+      : m_it1(std::move(_it1)),
+        m_it2(std::move(_it2)),
+        m_predicate(std::move(_predicate)) {}
+};
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class ReducerType, class ComparatorType>
+struct StdLexicographicalCompareFunctor {
+  using red_value_type = typename ReducerType::value_type;
+  IteratorType1 m_first1;
+  IteratorType2 m_first2;
+  ReducerType m_reducer;
+  ComparatorType m_comparator;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    const auto& my_value1 = m_first1[i];
+    const auto& my_value2 = m_first2[i];
+
+    bool different = m_comparator(my_value1, my_value2) ||
+                     m_comparator(my_value2, my_value1);
+    auto rv =
+        different
+            ? red_value_type{i}
+            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdLexicographicalCompareFunctor(IteratorType1 _first1, IteratorType2 _first2,
+                                   ReducerType _reducer, ComparatorType _comp)
+      : m_first1(std::move(_first1)),
+        m_first2(std::move(_first2)),
+        m_reducer(std::move(_reducer)),
+        m_comparator(std::move(_comp)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class ComparatorType>
+bool lexicographical_compare_impl(const std::string& label,
+                                  const ExecutionSpace& ex,
+                                  IteratorType1 first1, IteratorType1 last1,
+                                  IteratorType2 first2, IteratorType2 last2,
+                                  ComparatorType comp) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+  Impl::expect_valid_range(first2, last2);
+
+  // aliases
+  using index_type           = typename IteratorType1::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  const auto d1    = Kokkos::Experimental::distance(first1, last1);
+  const auto d2    = Kokkos::Experimental::distance(first2, last2);
+  const auto range = Kokkos::min(d1, d2);
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  using func1_t =
+      StdLexicographicalCompareFunctor<index_type, IteratorType1, IteratorType2,
+                                       reducer_type, ComparatorType>;
+
+  ::Kokkos::parallel_reduce(label, RangePolicy<ExecutionSpace>(ex, 0, range),
+                            func1_t(first1, first2, reducer, comp), reducer);
+
+  // fence not needed because reducing into scalar
+  // no mismatch
+  if (red_result.min_loc_true ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    auto new_last1 = first1 + range;
+    auto new_last2 = first2 + range;
+    bool is_prefix = (new_last1 == last1) && (new_last2 != last2);
+    return is_prefix;
+  }
+
+  // check mismatched
+  int less      = 0;
+  auto it1      = first1 + red_result.min_loc_true;
+  auto it2      = first2 + red_result.min_loc_true;
+  using func2_t = StdCompareFunctor<index_type, IteratorType1, IteratorType2,
+                                    ComparatorType>;
+  ::Kokkos::parallel_reduce(label, RangePolicy<ExecutionSpace>(ex, 0, 1),
+                            func2_t(it1, it2, comp), less);
+
+  // fence not needed because reducing into scalar
+  return static_cast<bool>(less);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+bool lexicographical_compare_impl(const std::string& label,
+                                  const ExecutionSpace& ex,
+                                  IteratorType1 first1, IteratorType1 last1,
+                                  IteratorType2 first2, IteratorType2 last2) {
+  using value_type_1 = typename IteratorType1::value_type;
+  using value_type_2 = typename IteratorType2::value_type;
+  using predicate_t =
+      Impl::StdAlgoLessThanBinaryPredicate<value_type_1, value_type_2>;
+  return lexicographical_compare_impl(label, ex, first1, last1, first2, last2,
+                                      predicate_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp
new file mode 100644
index 000000000..0a9d41b9b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp
@@ -0,0 +1,167 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MIN_MAX_MINMAX_ELEMENT_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_MIN_MAX_MINMAX_ELEMENT_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class ReducerType>
+struct StdMinOrMaxElemFunctor {
+  using index_type     = typename IteratorType::difference_type;
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType m_first;
+  ReducerType m_reducer;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, red_value_type& red_value) const {
+    m_reducer.join(red_value, red_value_type{m_first[i], i});
+  }
+
+  KOKKOS_FUNCTION
+  StdMinOrMaxElemFunctor(IteratorType first, ReducerType reducer)
+      : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
+};
+
+template <class IteratorType, class ReducerType>
+struct StdMinMaxElemFunctor {
+  using index_type     = typename IteratorType::difference_type;
+  using red_value_type = typename ReducerType::value_type;
+  IteratorType m_first;
+  ReducerType m_reducer;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, red_value_type& red_value) const {
+    const auto& my_value = m_first[i];
+    m_reducer.join(red_value, red_value_type{my_value, my_value, i, i});
+  }
+
+  KOKKOS_FUNCTION
+  StdMinMaxElemFunctor(IteratorType first, ReducerType reducer)
+      : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
+};
+
+template <template <class... Args> class ReducerType, class ExecutionSpace,
+          class IteratorType, class... Args>
+IteratorType min_or_max_element_impl(const std::string& label,
+                                     const ExecutionSpace& ex,
+                                     IteratorType first, IteratorType last,
+                                     Args&&... args) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return last;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using value_type           = typename IteratorType::value_type;
+  using reducer_type         = ReducerType<value_type, index_type, Args...>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t = StdMinOrMaxElemFunctor<IteratorType, reducer_type>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result, std::forward<Args>(args)...);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_t(first, reducer), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // return
+  return first + red_result.loc;
+}
+
+template <template <class... Args> class ReducerType, class ExecutionSpace,
+          class IteratorType, class... Args>
+::Kokkos::pair<IteratorType, IteratorType> minmax_element_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last, Args&&... args) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return {first, first};
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using value_type           = typename IteratorType::value_type;
+  using reducer_type         = ReducerType<value_type, index_type, Args...>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t               = StdMinMaxElemFunctor<IteratorType, reducer_type>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result, std::forward<Args>(args)...);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_t(first, reducer), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // return
+  return {first + red_result.min_loc, first + red_result.max_loc};
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp
new file mode 100644
index 000000000..180afe925
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Mismatch.hpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MISMATCH_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_MISMATCH_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class ReducerType, class BinaryPredicateType>
+struct StdMismatchRedFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType1 m_first1;
+  IteratorType2 m_first2;
+  ReducerType m_reducer;
+  BinaryPredicateType m_predicate;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    const auto& my_value1 = m_first1[i];
+    const auto& my_value2 = m_first2[i];
+
+    auto rv =
+        !m_predicate(my_value1, my_value2)
+            ? red_value_type{i}
+            : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdMismatchRedFunctor(IteratorType1 first1, IteratorType2 first2,
+                        ReducerType reducer, BinaryPredicateType predicate)
+      : m_first1(std::move(first1)),
+        m_first2(std::move(first2)),
+        m_reducer(std::move(reducer)),
+        m_predicate(std::move(predicate)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2,
+    BinaryPredicateType predicate) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+  Impl::expect_valid_range(first2, last2);
+
+  // aliases
+  using return_type          = ::Kokkos::pair<IteratorType1, IteratorType2>;
+  using index_type           = typename IteratorType1::difference_type;
+  using reducer_type         = FirstLoc<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using functor_type =
+      StdMismatchRedFunctor<index_type, IteratorType1, IteratorType2,
+                            reducer_type, BinaryPredicateType>;
+
+  // trivial case: note that this is important,
+  // for OpenMPTarget, omitting special handling of
+  // the trivial case was giving all sorts of strange stuff.
+  const auto num_e1 = last1 - first1;
+  const auto num_e2 = last2 - first2;
+  if (num_e1 == 0 || num_e2 == 0) {
+    return return_type(first1, first2);
+  }
+
+  // run
+  const auto num_elemen_par_reduce = (num_e1 <= num_e2) ? num_e1 : num_e2;
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  ::Kokkos::parallel_reduce(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elemen_par_reduce),
+      functor_type(first1, first2, reducer, std::move(predicate)), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // decide and return
+  constexpr auto red_min = ::Kokkos::reduction_identity<index_type>::min();
+  if (red_result.min_loc_true == red_min) {
+    // in here means mismatch has not been found
+    if (num_e1 == num_e2) {
+      return return_type(last1, last2);
+    } else if (num_e1 < num_e2) {
+      return return_type(last1, first2 + num_e1);
+    } else {
+      return return_type(first1 + num_e2, last2);
+    }
+  } else {
+    // in here means mismatch has been found
+    return return_type(first1 + red_result.min_loc_true,
+                       first2 + red_result.min_loc_true);
+  }
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+::Kokkos::pair<IteratorType1, IteratorType2> mismatch_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) {
+  using value_type1 = typename IteratorType1::value_type;
+  using value_type2 = typename IteratorType2::value_type;
+  using pred_t      = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return mismatch_impl(label, ex, first1, last1, first2, last2, pred_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp
new file mode 100644
index 000000000..6b1ed1da4
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Move.hpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MOVE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_MOVE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIterator, class OutputIterator>
+struct StdMoveFunctor {
+  InputIterator m_first;
+  OutputIterator m_dest_first;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const {
+    m_dest_first[i] = std::move(m_first[i]);
+  }
+
+  StdMoveFunctor(InputIterator _first, OutputIterator _dest_first)
+      : m_first(std::move(_first)), m_dest_first(std::move(_dest_first)) {}
+};
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator move_impl(const std::string& label, const ExecutionSpace& ex,
+                         InputIterator first, InputIterator last,
+                         OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using index_type = typename InputIterator::difference_type;
+  using func_t     = StdMoveFunctor<index_type, InputIterator, OutputIterator>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first, d_first));
+  ex.fence("Kokkos::move: fence after operation");
+
+  // return
+  return d_first + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp
new file mode 100644
index 000000000..c34ab679d
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_MOVE_BACKWARD_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_MOVE_BACKWARD_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2>
+struct StdMoveBackwardFunctor {
+  static_assert(std::is_signed<IndexType>::value,
+                "Kokkos: StdMoveBackwardFunctor requires signed index type");
+
+  IteratorType1 m_last;
+  IteratorType2 m_dest_last;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const {
+    m_dest_last[-i - 1] = std::move(m_last[-i - 1]);
+  }
+
+  StdMoveBackwardFunctor(IteratorType1 _last, IteratorType2 _dest_last)
+      : m_last(std::move(_last)), m_dest_last(std::move(_dest_last)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 move_backward_impl(const std::string& label,
+                                 const ExecutionSpace& ex, IteratorType1 first,
+                                 IteratorType1 last, IteratorType2 d_last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_last);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using index_type = typename IteratorType1::difference_type;
+  using func_t =
+      StdMoveBackwardFunctor<index_type, IteratorType1, IteratorType2>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(last, d_last));
+  ex.fence("Kokkos::move_backward: fence after operation");
+
+  // return
+  return d_last - num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp
new file mode 100644
index 000000000..508e4baed
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionCopy.hpp
@@ -0,0 +1,180 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_PARTITION_COPY_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_PARTITION_COPY_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ValueType>
+struct StdPartitionCopyScalar {
+  ValueType true_count_;
+  ValueType false_count_;
+
+  // Here we implement the copy assignment operators explicitly for consistency
+  // with how the Scalar structs are implemented inside
+  // Kokkos_Parallel_Reduce.hpp.
+  KOKKOS_FUNCTION
+  void operator=(const StdPartitionCopyScalar& other) {
+    true_count_  = other.true_count_;
+    false_count_ = other.false_count_;
+  }
+
+  // this is needed for
+  // OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp:699:21: error: no viable
+  // overloaded '=' m_returnvalue = 0;
+  //
+  KOKKOS_FUNCTION
+  void operator=(const ValueType value) {
+    true_count_  = value;
+    false_count_ = value;
+  }
+};
+
+template <class IndexType, class FirstFrom, class FirstDestTrue,
+          class FirstDestFalse, class PredType>
+struct StdPartitionCopyFunctor {
+  using value_type = StdPartitionCopyScalar<IndexType>;
+
+  FirstFrom m_first_from;
+  FirstDestTrue m_first_dest_true;
+  FirstDestFalse m_first_dest_false;
+  PredType m_pred;
+
+  KOKKOS_FUNCTION
+  StdPartitionCopyFunctor(FirstFrom first_from, FirstDestTrue first_dest_true,
+                          FirstDestFalse first_dest_false, PredType pred)
+      : m_first_from(std::move(first_from)),
+        m_first_dest_true(std::move(first_dest_true)),
+        m_first_dest_false(std::move(first_dest_false)),
+        m_pred(std::move(pred)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    const auto& myval = m_first_from[i];
+    if (final_pass) {
+      if (m_pred(myval)) {
+        m_first_dest_true[update.true_count_] = myval;
+      } else {
+        m_first_dest_false[update.false_count_] = myval;
+      }
+    }
+
+    if (m_pred(myval)) {
+      update.true_count_ += 1;
+    } else {
+      update.false_count_ += 1;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void init(value_type& update) const {
+    update.true_count_  = 0;
+    update.false_count_ = 0;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    update.true_count_ += input.true_count_;
+    update.false_count_ += input.false_count_;
+  }
+};
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorTrueType, class OutputIteratorFalseType,
+          class PredicateType>
+::Kokkos::pair<OutputIteratorTrueType, OutputIteratorFalseType>
+partition_copy_impl(const std::string& label, const ExecutionSpace& ex,
+                    InputIteratorType from_first, InputIteratorType from_last,
+                    OutputIteratorTrueType to_first_true,
+                    OutputIteratorFalseType to_first_false,
+                    PredicateType pred) {
+  // impl uses a scan, this is similar how we implemented copy_if
+
+  // checks
+  Impl::static_assert_random_access_and_accessible(
+      ex, from_first, to_first_true, to_first_false);
+  Impl::static_assert_iterators_have_matching_difference_type(
+      from_first, to_first_true, to_first_false);
+  Impl::expect_valid_range(from_first, from_last);
+
+  if (from_first == from_last) {
+    return {to_first_true, to_first_false};
+  }
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using func_type =
+      StdPartitionCopyFunctor<index_type, InputIteratorType,
+                              OutputIteratorTrueType, OutputIteratorFalseType,
+                              PredicateType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(from_first, from_last);
+  typename func_type::value_type counts{0, 0};
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_type(from_first, to_first_true, to_first_false, pred), counts);
+
+  // fence not needed here because of the scan into counts
+
+  return {to_first_true + counts.true_count_,
+          to_first_false + counts.false_count_};
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp
new file mode 100644
index 000000000..671e8d70f
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_PartitionPoint.hpp
@@ -0,0 +1,132 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_PARTITION_POINT_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_PARTITION_POINT_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class ReducerType, class PredicateType>
+struct StdPartitionPointFunctor {
+  using red_value_type = typename ReducerType::value_type;
+  using index_type     = typename IteratorType::difference_type;
+
+  IteratorType m_first;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, red_value_type& redValue) const {
+    const auto predicate_value = m_p(m_first[i]);
+    auto rv =
+        predicate_value
+            ? red_value_type{::Kokkos::reduction_identity<index_type>::min()}
+            : red_value_type{i};
+    m_reducer.join(redValue, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdPartitionPointFunctor(IteratorType first, ReducerType reducer,
+                           PredicateType p)
+      : m_first(std::move(first)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+IteratorType partition_point_impl(const std::string& label,
+                                  const ExecutionSpace& ex, IteratorType first,
+                                  IteratorType last, PredicateType pred) {
+  // locates the end of the first partition, that is, the first
+  // element that does not satisfy p or last if all elements satisfy p.
+  // Implementation below finds the first location where p is false.
+
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return first;
+  }
+
+  // aliases
+  using index_type           = typename IteratorType::difference_type;
+  using reducer_type         = StdPartitionPoint<index_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+  using func_t =
+      StdPartitionPointFunctor<IteratorType, reducer_type, PredicateType>;
+
+  // run
+  reduction_value_type red_result;
+  reducer_type reducer(red_result);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            func_t(first, reducer, pred), reducer);
+
+  // fence not needed because reducing into scalar
+
+  // decide and return
+  if (red_result.min_loc_false ==
+      ::Kokkos::reduction_identity<index_type>::min()) {
+    // if all elements are true, return last
+    return last;
+  } else {
+    return first + red_result.min_loc_false;
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RandomAccessIterator.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp
similarity index 88%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_RandomAccessIterator.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp
index 01c0d7672..2457d9400 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_RandomAccessIterator.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp
@@ -42,8 +42,8 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_RANDOM_ACCESS_ITERATOR_HPP
-#define KOKKOS_RANDOM_ACCESS_ITERATOR_HPP
+#ifndef KOKKOS_RANDOM_ACCESS_ITERATOR_IMPL_HPP
+#define KOKKOS_RANDOM_ACCESS_ITERATOR_IMPL_HPP
 
 #include <iterator>
 #include <Kokkos_Macros.hpp>
@@ -58,18 +58,16 @@ template <class T>
 class RandomAccessIterator;
 
 template <class DataType, class... Args>
-class RandomAccessIterator< ::Kokkos::View<DataType, Args...> >
-    : public std::iterator<
-          std::random_access_iterator_tag,
-          typename ::Kokkos::View<DataType, Args...>::value_type, ptrdiff_t,
-          typename ::Kokkos::View<DataType, Args...>::pointer_type,
-          typename ::Kokkos::View<DataType, Args...>::reference_type> {
+class RandomAccessIterator< ::Kokkos::View<DataType, Args...> > {
  public:
-  using view_type       = ::Kokkos::View<DataType, Args...>;
-  using iterator_type   = RandomAccessIterator<view_type>;
-  using difference_type = ptrdiff_t;
-  using value_type      = typename view_type::value_type;
-  using reference       = typename view_type::reference_type;
+  using view_type     = ::Kokkos::View<DataType, Args...>;
+  using iterator_type = RandomAccessIterator<view_type>;
+
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type        = typename view_type::value_type;
+  using difference_type   = ptrdiff_t;
+  using pointer           = typename view_type::pointer_type;
+  using reference         = typename view_type::reference_type;
 
   static_assert(view_type::rank == 1 &&
                     (std::is_same<typename view_type::traits::array_layout,
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp
new file mode 100644
index 000000000..26e0795d8
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp
@@ -0,0 +1,186 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REDUCE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REDUCE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IteratorType, class ValueType>
+struct StdReduceDefaultFunctor {
+  using index_type = typename IteratorType::difference_type;
+
+  const IteratorType m_first;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, ValueType& update) const {
+    update += m_first[i];
+  }
+};
+
+template <class ValueType>
+struct StdReduceDefaultJoinFunctor {
+  KOKKOS_FUNCTION
+  constexpr ValueType operator()(const ValueType& a, const ValueType& b) const {
+    return a + b;
+  }
+};
+
+template <class IteratorType, class ReducerType>
+struct StdReduceFunctor {
+  using red_value_type = typename ReducerType::value_type;
+  using index_type     = typename IteratorType::difference_type;
+
+  const IteratorType m_first;
+  const ReducerType m_reducer;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, red_value_type& red_value) const {
+    auto tmp_wrapped_value = red_value_type{m_first[i], false};
+
+    if (red_value.is_initial) {
+      red_value = tmp_wrapped_value;
+    } else {
+      m_reducer.join(red_value, tmp_wrapped_value);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdReduceFunctor(IteratorType first, ReducerType reducer)
+      : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
+};
+
+//------------------------------
+// reduce_custom_functors_impl
+//------------------------------
+template <class ExecutionSpace, class IteratorType, class ValueType,
+          class JoinerType>
+ValueType reduce_custom_functors_impl(const std::string& label,
+                                      const ExecutionSpace& ex,
+                                      IteratorType first, IteratorType last,
+                                      ValueType init_reduction_value,
+                                      JoinerType joiner) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    // init is returned, unmodified
+    return init_reduction_value;
+  }
+
+  // aliases
+  using reducer_type =
+      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
+  using functor_type         = StdReduceFunctor<IteratorType, reducer_type>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  reduction_value_type result;
+  reducer_type reducer(result, joiner);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            functor_type(first, reducer), reducer);
+
+  // fence not needed since reducing into scalar
+  return joiner(result.val, init_reduction_value);
+}
+
+template <typename ValueType>
+using has_reduction_identity_sum_t =
+    decltype(Kokkos::reduction_identity<ValueType>::sum());
+
+template <class ExecutionSpace, class IteratorType, class ValueType>
+ValueType reduce_default_functors_impl(const std::string& label,
+                                       const ExecutionSpace& ex,
+                                       IteratorType first, IteratorType last,
+                                       ValueType init_reduction_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::expect_valid_range(first, last);
+
+  using value_type = Kokkos::Impl::remove_cvref_t<ValueType>;
+
+  if (::Kokkos::is_detected<has_reduction_identity_sum_t, value_type>::value) {
+    if (first == last) {
+      // init is returned, unmodified
+      return init_reduction_value;
+    }
+
+    using functor_type =
+        Impl::StdReduceDefaultFunctor<IteratorType, value_type>;
+
+    // run
+    value_type tmp;
+    const auto num_elements = Kokkos::Experimental::distance(first, last);
+    ::Kokkos::parallel_reduce(label,
+                              RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                              functor_type{first}, tmp);
+    // fence not needed since reducing into scalar
+    tmp += init_reduction_value;
+    return tmp;
+  } else {
+    using joiner_type = Impl::StdReduceDefaultJoinFunctor<value_type>;
+    return reduce_custom_functors_impl(
+        label, ex, first, last, std::move(init_reduction_value), joiner_type());
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp
similarity index 91%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp
index dd529a25c..d8e383b85 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReducerWithArbitraryJoinerNoNeutralElement.hpp
@@ -42,8 +42,8 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_ReducerWithArbitraryJoinerNoNeutralElement_hpp_
-#define KOKKOS_STD_ReducerWithArbitraryJoinerNoNeutralElement_hpp_
+#ifndef KOKKOS_STD_ALGORITHMS_REDUCER_WITH_ARBITRARY_JOINER_NONEUTRAL_ELEMENT_HPP
+#define KOKKOS_STD_ALGORITHMS_REDUCER_WITH_ARBITRARY_JOINER_NONEUTRAL_ELEMENT_HPP
 
 #include <Kokkos_Core.hpp>
 #include "Kokkos_ValueWrapperForNoNeutralElement.hpp"
@@ -58,7 +58,7 @@ namespace Impl {
 
 template <class Scalar, class JoinerType, class Space = HostSpace>
 struct ReducerWithArbitraryJoinerNoNeutralElement {
-  using scalar_type = typename std::remove_cv<Scalar>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
 
  public:
   // Required
@@ -90,11 +90,6 @@ struct ReducerWithArbitraryJoinerNoNeutralElement {
     dest.val = m_joiner(dest.val, src.val);
   }
 
-  KOKKOS_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest.val = m_joiner(dest.val, src.val);
-  }
-
   KOKKOS_FUNCTION
   void init(value_type& val) const {
     // I cannot call reduction_identity, so need to default this
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
new file mode 100644
index 000000000..742d4d776
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp
@@ -0,0 +1,212 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REMOVE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REMOVE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <std_algorithms/Kokkos_CountIf.hpp>
+#include <std_algorithms/Kokkos_CopyIf.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class FirstFrom, class FirstDest, class PredType>
+struct StdRemoveIfStage1Functor {
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  PredType m_must_remove;
+
+  KOKKOS_FUNCTION
+  StdRemoveIfStage1Functor(FirstFrom first_from, FirstDest first_dest,
+                           PredType pred)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_must_remove(std::move(pred)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, IndexType& update,
+                  const bool final_pass) const {
+    auto& myval = m_first_from[i];
+    if (final_pass) {
+      if (!m_must_remove(myval)) {
+        // calling move here is ok because we are inside final pass
+        // we are calling move assign as specified by the std
+        m_first_dest[update] = std::move(myval);
+      }
+    }
+
+    if (!m_must_remove(myval)) {
+      update += 1;
+    }
+  }
+};
+
+template <class IndexType, class InputIteratorType, class OutputIteratorType>
+struct StdRemoveIfStage2Functor {
+  InputIteratorType m_first_from;
+  OutputIteratorType m_first_to;
+
+  KOKKOS_FUNCTION
+  StdRemoveIfStage2Functor(InputIteratorType first_from,
+                           OutputIteratorType first_to)
+      : m_first_from(std::move(first_from)), m_first_to(std::move(first_to)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i) const {
+    m_first_to[i] = std::move(m_first_from[i]);
+  }
+};
+
+template <class ExecutionSpace, class IteratorType, class UnaryPredicateType>
+IteratorType remove_if_impl(const std::string& label, const ExecutionSpace& ex,
+                            IteratorType first, IteratorType last,
+                            UnaryPredicateType pred) {
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    return last;
+  } else {
+    // create tmp buffer to use to *move* all elements that we need to keep.
+    // note that the tmp buffer is just large enought to store
+    // all elements to keep, because ideally we do not need/want one
+    // as large as the original range.
+    // To allocate the right tmp view, we need a call to count_if.
+    // We could just do a "safe" allocation of a buffer as
+    // large as (last-first), but I think a call to count_if is more afforable.
+
+    // count how many elements we need to keep
+    // note that the elements to remove are those that meet the predicate
+    const auto remove_count =
+        ::Kokkos::Experimental::count_if(ex, first, last, pred);
+    const auto keep_count =
+        Kokkos::Experimental::distance(first, last) - remove_count;
+
+    // create helper tmp view
+    using value_type    = typename IteratorType::value_type;
+    using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
+    tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count);
+    using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
+
+    // in stage 1, *move* all elements to keep from original range to tmp
+    // we use similar impl as copy_if except that we *move* rather than copy
+    using index_type = typename IteratorType::difference_type;
+    using func1_type = StdRemoveIfStage1Functor<index_type, IteratorType,
+                                                tmp_readwrite_iterator_type,
+                                                UnaryPredicateType>;
+
+    const auto scan_num_elements = Kokkos::Experimental::distance(first, last);
+    index_type scan_count        = 0;
+    ::Kokkos::parallel_scan(
+        label, RangePolicy<ExecutionSpace>(ex, 0, scan_num_elements),
+        func1_type(first, begin(tmp_view), pred), scan_count);
+
+    // scan_count should be equal to keep_count
+    assert(scan_count == keep_count);
+    (void)scan_count;  // to avoid unused complaints
+
+    // stage 2, we do parfor to move from tmp to original range
+    using func2_type =
+        StdRemoveIfStage2Functor<index_type, tmp_readwrite_iterator_type,
+                                 IteratorType>;
+    ::Kokkos::parallel_for(
+        "remove_if_stage2_parfor",
+        RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
+        func2_type(begin(tmp_view), first));
+    ex.fence("Kokkos::remove_if: fence after stage2");
+
+    // return
+    return first + keep_count;
+  }
+}
+
+template <class ExecutionSpace, class IteratorType, class ValueType>
+auto remove_impl(const std::string& label, const ExecutionSpace& ex,
+                 IteratorType first, IteratorType last,
+                 const ValueType& value) {
+  using predicate_type = StdAlgoEqualsValUnaryPredicate<ValueType>;
+  return remove_if_impl(label, ex, first, last, predicate_type(value));
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+auto remove_copy_impl(const std::string& label, const ExecutionSpace& ex,
+                      InputIteratorType first_from, InputIteratorType last_from,
+                      OutputIteratorType first_dest, const ValueType& value) {
+  // this is like copy_if except that we need to *ignore* the elements
+  // that match the value, so we can solve this as follows:
+
+  using predicate_type = StdAlgoNotEqualsValUnaryPredicate<ValueType>;
+  return ::Kokkos::Experimental::copy_if(label, ex, first_from, last_from,
+                                         first_dest, predicate_type(value));
+}
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class UnaryPredicate>
+auto remove_copy_if_impl(const std::string& label, const ExecutionSpace& ex,
+                         InputIteratorType first_from,
+                         InputIteratorType last_from,
+                         OutputIteratorType first_dest,
+                         const UnaryPredicate& pred) {
+  // this is like copy_if except that we need to *ignore* the elements
+  // satisfying the pred, so we can solve this as follows:
+
+  using value_type = typename InputIteratorType::value_type;
+  using pred_wrapper_type =
+      StdAlgoNegateUnaryPredicateWrapper<value_type, UnaryPredicate>;
+  return ::Kokkos::Experimental::copy_if(label, ex, first_from, last_from,
+                                         first_dest, pred_wrapper_type(pred));
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp
new file mode 100644
index 000000000..877ffa276
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Replace.hpp
@@ -0,0 +1,103 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class InputIterator, class ValueType>
+struct StdReplaceFunctor {
+  using index_type = typename InputIterator::difference_type;
+  InputIterator m_first;
+  ValueType m_old_value;
+  ValueType m_new_value;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i) const {
+    if (m_first[i] == m_old_value) {
+      m_first[i] = m_new_value;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdReplaceFunctor(InputIterator first, ValueType old_value,
+                    ValueType new_value)
+      : m_first(std::move(first)),
+        m_old_value(std::move(old_value)),
+        m_new_value(std::move(new_value)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class ValueType>
+void replace_impl(const std::string& label, const ExecutionSpace& ex,
+                  IteratorType first, IteratorType last,
+                  const ValueType& old_value, const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using func_t = StdReplaceFunctor<IteratorType, ValueType>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first, old_value, new_value));
+  ex.fence("Kokkos::replace: fence after operation");
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp
new file mode 100644
index 000000000..b75dde9cd
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopy.hpp
@@ -0,0 +1,122 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_COPY_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_COPY_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class InputIterator, class OutputIterator, class ValueType>
+struct StdReplaceCopyFunctor {
+  using index_type = typename InputIterator::difference_type;
+
+  InputIterator m_first_from;
+  OutputIterator m_first_dest;
+  ValueType m_old_value;
+  ValueType m_new_value;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i) const {
+    const auto& myvalue_from = m_first_from[i];
+
+    if (myvalue_from == m_old_value) {
+      m_first_dest[i] = m_new_value;
+    } else {
+      m_first_dest[i] = myvalue_from;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdReplaceCopyFunctor(InputIterator first_from, OutputIterator first_dest,
+                        ValueType old_value, ValueType new_value)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_old_value(std::move(old_value)),
+        m_new_value(std::move(new_value)) {}
+};
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType>
+OutputIteratorType replace_copy_impl(const std::string& label,
+                                     const ExecutionSpace& ex,
+                                     InputIteratorType first_from,
+                                     InputIteratorType last_from,
+                                     OutputIteratorType first_dest,
+                                     const ValueType& old_value,
+                                     const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using func_t =
+      StdReplaceCopyFunctor<InputIteratorType, OutputIteratorType, ValueType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first_from, first_dest, old_value, new_value));
+  ex.fence("Kokkos::replace_copy: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp
new file mode 100644
index 000000000..8f7c8140e
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceCopyIf.hpp
@@ -0,0 +1,123 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_COPY_IF_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_COPY_IF_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIterator, class OutputIterator,
+          class PredicateType, class ValueType>
+struct StdReplaceIfCopyFunctor {
+  InputIterator m_first_from;
+  OutputIterator m_first_dest;
+  PredicateType m_pred;
+  ValueType m_new_value;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const {
+    const auto& myvalue_from = m_first_from[i];
+
+    if (m_pred(myvalue_from)) {
+      m_first_dest[i] = m_new_value;
+    } else {
+      m_first_dest[i] = myvalue_from;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdReplaceIfCopyFunctor(InputIterator first_from, OutputIterator first_dest,
+                          PredicateType pred, ValueType new_value)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_pred(std::move(pred)),
+        m_new_value(std::move(new_value)) {}
+};
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class PredicateType, class ValueType>
+OutputIteratorType replace_copy_if_impl(const std::string& label,
+                                        const ExecutionSpace& ex,
+                                        InputIteratorType first_from,
+                                        InputIteratorType last_from,
+                                        OutputIteratorType first_dest,
+                                        PredicateType pred,
+                                        const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using func_t =
+      StdReplaceIfCopyFunctor<index_type, InputIteratorType, OutputIteratorType,
+                              PredicateType, ValueType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_t(first_from, first_dest, std::move(pred), new_value));
+  ex.fence("Kokkos::replace_copy_if: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp
new file mode 100644
index 000000000..6fe33019c
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReplaceIf.hpp
@@ -0,0 +1,105 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REPLACE_IF_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REPLACE_IF_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class InputIterator, class PredicateType, class NewValueType>
+struct StdReplaceIfFunctor {
+  using index_type = typename InputIterator::difference_type;
+
+  InputIterator m_first;
+  PredicateType m_predicate;
+  NewValueType m_new_value;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i) const {
+    if (m_predicate(m_first[i])) {
+      m_first[i] = m_new_value;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdReplaceIfFunctor(InputIterator first, PredicateType pred,
+                      NewValueType new_value)
+      : m_first(std::move(first)),
+        m_predicate(std::move(pred)),
+        m_new_value(std::move(new_value)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class PredicateType,
+          class ValueType>
+void replace_if_impl(const std::string& label, const ExecutionSpace& ex,
+                     IteratorType first, IteratorType last, PredicateType pred,
+                     const ValueType& new_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using func_t = StdReplaceIfFunctor<IteratorType, PredicateType, ValueType>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first, std::move(pred), new_value));
+  ex.fence("Kokkos::replace_if: fence after operation");
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
new file mode 100644
index 000000000..f84eb2c81
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp
@@ -0,0 +1,111 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REVERSE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REVERSE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <std_algorithms/Kokkos_Swap.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class InputIterator>
+struct StdReverseFunctor {
+  using index_type = typename InputIterator::difference_type;
+  static_assert(std::is_signed<index_type>::value,
+                "Kokkos: StdReverseFunctor requires signed index type");
+
+  InputIterator m_first;
+  InputIterator m_last;
+
+  KOKKOS_FUNCTION
+  void operator()(index_type i) const {
+    // the swap below is doing the same thing, but
+    // for Intel 18.0.5 does not work.
+    // But putting the impl directly here, it works.
+#ifdef KOKKOS_COMPILER_INTEL
+    typename InputIterator::value_type tmp = std::move(m_first[i]);
+    m_first[i]                             = std::move(m_last[-i - 1]);
+    m_last[-i - 1]                         = std::move(tmp);
+#else
+    ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]);
+#endif
+  }
+
+  StdReverseFunctor(InputIterator first, InputIterator last)
+      : m_first(std::move(first)), m_last(std::move(last)) {}
+};
+
+template <class ExecutionSpace, class InputIterator>
+void reverse_impl(const std::string& label, const ExecutionSpace& ex,
+                  InputIterator first, InputIterator last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using func_t = StdReverseFunctor<InputIterator>;
+
+  // run
+  if (last >= first + 2) {
+    // only need half
+    const auto num_elements = Kokkos::Experimental::distance(first, last) / 2;
+    ::Kokkos::parallel_for(label,
+                           RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                           func_t(first, last));
+    ex.fence("Kokkos::reverse: fence after operation");
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp
new file mode 100644
index 000000000..88b6ed16b
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp
@@ -0,0 +1,102 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_REVERSE_COPY_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_REVERSE_COPY_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIterator, class OutputIterator>
+struct StdReverseCopyFunctor {
+  static_assert(std::is_signed<IndexType>::value,
+                "Kokkos: StdReverseCopyFunctor requires signed index type");
+
+  InputIterator m_last;
+  OutputIterator m_dest_first;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const { m_dest_first[i] = m_last[-1 - i]; }
+
+  StdReverseCopyFunctor(InputIterator _last, OutputIterator _dest_first)
+      : m_last(std::move(_last)), m_dest_first(std::move(_dest_first)) {}
+};
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator reverse_copy_impl(const std::string& label,
+                                 const ExecutionSpace& ex, InputIterator first,
+                                 InputIterator last, OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using index_type = typename InputIterator::difference_type;
+  using func_t =
+      StdReverseCopyFunctor<index_type, InputIterator, OutputIterator>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(last, d_first));
+  ex.fence("Kokkos::reverse_copy: fence after operation");
+
+  // return
+  return d_first + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp
new file mode 100644
index 000000000..c08cf1aec
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Rotate.hpp
@@ -0,0 +1,219 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ROTATE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_ROTATE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Move.hpp>
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType rotate_with_pivot_in_left_half(const std::string& label,
+                                            const ExecutionSpace& ex,
+                                            IteratorType first,
+                                            IteratorType n_first,
+                                            IteratorType last) {
+  /*
+    This impl is specific for when the n_first iterator points to
+    an element that is before or equal to the middle of the range.
+
+    If we have:
+
+    | 0 | 1 | 2 | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 | -6 | -5 | 8 | 9 | 11 | *
+      ^           ^              mid					   ^
+    first       n_first							  last
+
+    In step 1, we create a temporary view with extent = distance(n_first, last)
+    and *move* the elements from [n_first, last) to tmp view, such that
+    tmp view becomes:
+
+    | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 | -6 | -5 | 8 | 9 | 11 |
+
+    In step 2, we move the elements in [first, n_first)
+    to the new position where they are supposed to end up.
+
+    In step 3, we move the elements from the tmp view to
+    the range starting at first.
+   */
+
+  namespace KE                     = ::Kokkos::Experimental;
+  const auto num_elements_on_left  = KE::distance(first, n_first);
+  const auto num_elements_on_right = KE::distance(n_first, last);
+
+  // create helper tmp view
+  using value_type    = typename IteratorType::value_type;
+  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
+  tmp_view_type tmp_view("rotate_impl_for_pivot_in_left_half_impl",
+                         num_elements_on_right);
+  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
+
+  // index_type is the same and needed in all steps
+  using index_type = typename IteratorType::difference_type;
+
+  // stage 1
+  using step1_func_type =
+      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_right),
+      step1_func_type(n_first, begin(tmp_view)));
+
+  // stage 2
+  using step2_func_type =
+      StdMoveFunctor<index_type, IteratorType, IteratorType>;
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_left),
+      step2_func_type(first, first + num_elements_on_right));
+
+  // step 3
+  using step3_func_type =
+      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
+                         step3_func_type(begin(tmp_view), first));
+
+  ex.fence("Kokkos::rotate: fence after operation");
+  return first + (last - n_first);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType rotate_with_pivot_in_right_half(const std::string& label,
+                                             const ExecutionSpace& ex,
+                                             IteratorType first,
+                                             IteratorType n_first,
+                                             IteratorType last) {
+  /*
+    This impl is specific for when the n_first iterator points to
+    an element that is after the middle of the range.
+
+    If we have:
+
+    | 0 | 1 | 2 | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 | -6 | -5 | 8 | 9 | 11 | *
+      ^                          mid            ^                          ^
+    first                                    n_first			  last
+
+    In step 1, we create a temporary view with extent = distance(first, n_first)
+    and *move* the elements from [first, n_first) to tmp view,
+    such that tmp view becomes:
+
+    | 0 | 1 | 2 | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 |
+
+    In step 2, we move the elements in [n_first, last)
+    to the beginning where they are supposed to end up.
+
+    In step 3, we move the elements from the tmp view to
+    the range starting at first.
+   */
+
+  namespace KE                     = ::Kokkos::Experimental;
+  const auto num_elements_on_left  = KE::distance(first, n_first);
+  const auto num_elements_on_right = KE::distance(n_first, last);
+
+  // create helper tmp view
+  using value_type    = typename IteratorType::value_type;
+  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
+  tmp_view_type tmp_view("rotate_impl_for_pivot_in_left_half_impl",
+                         num_elements_on_left);
+  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
+
+  // index_type is the same and needed in all steps
+  using index_type = typename IteratorType::difference_type;
+
+  // stage 1
+  using step1_func_type =
+      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_left),
+      step1_func_type(first, begin(tmp_view)));
+
+  // stage 2
+  using step2_func_type =
+      StdMoveFunctor<index_type, IteratorType, IteratorType>;
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_right),
+      step2_func_type(n_first, first));
+
+  // step 3:
+  using step3_func_type =
+      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
+      step3_func_type(begin(tmp_view), first + num_elements_on_right));
+
+  ex.fence("Kokkos::rotate: fence after operation");
+  return first + (last - n_first);
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType rotate_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, IteratorType n_first,
+                         IteratorType last) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(first, n_first);
+  Impl::expect_valid_range(n_first, last);
+
+  namespace KE                     = ::Kokkos::Experimental;
+  const auto num_elements          = KE::distance(first, last);
+  const auto n_distance_from_first = KE::distance(first, n_first);
+  if (n_distance_from_first <= num_elements / 2) {
+    return rotate_with_pivot_in_left_half(label, ex, first, n_first, last);
+  } else {
+    return rotate_with_pivot_in_right_half(label, ex, first, n_first, last);
+  }
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp
new file mode 100644
index 000000000..a8111a511
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RotateCopy.hpp
@@ -0,0 +1,149 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_ROTATE_COPY_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_ROTATE_COPY_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIterator, class OutputIterator>
+struct StdRotateCopyFunctor {
+  InputIterator m_first;
+  InputIterator m_last;
+  InputIterator m_first_n;
+  OutputIterator m_dest_first;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const {
+    const IndexType shift = m_last - m_first_n;
+
+    if (i < shift) {
+      m_dest_first[i] = m_first_n[i];
+    } else {
+      m_dest_first[i] = m_first[i - shift];
+    }
+  }
+
+  StdRotateCopyFunctor(InputIterator first, InputIterator last,
+                       InputIterator first_n, OutputIterator dest_first)
+      : m_first(std::move(first)),
+        m_last(std::move(last)),
+        m_first_n(std::move(first_n)),
+        m_dest_first(std::move(dest_first)) {}
+};
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator rotate_copy_impl(const std::string& label,
+                                const ExecutionSpace& ex, InputIterator first,
+                                InputIterator n_first, InputIterator last,
+                                OutputIterator d_first) {
+  /*
+    algorithm is implemented as follows:
+
+    first 	   n_first		last
+    |		      |                  |
+    o  o  o  o  o  o  o  o  o  o  o  o
+
+    dest+0 -> first_n
+    dest+1 -> first_n+1
+    dest+2 -> first_n+2
+    dest+3 -> first
+    dest+4 -> first+1
+    dest+5 -> first+2
+    dest+6 -> first+3
+    dest+7 -> first+4
+    dest+8 -> first+5
+    ...
+    let shift = last - first_n;
+
+    then we have:
+    if (i < shift){
+      *(dest_first + i) = *(first_n + i);
+    }
+    else{
+      *(dest_first + i) = *(from + i - shift);
+    }
+  */
+
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(first, n_first);
+  Impl::expect_valid_range(n_first, last);
+
+  if (first == last) {
+    return d_first;
+  }
+
+  // aliases
+  using index_type = typename InputIterator::difference_type;
+  using func_type =
+      StdRotateCopyFunctor<index_type, InputIterator, OutputIterator>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_type(first, last, n_first, d_first));
+
+  ex.fence("Kokkos::rotate_copy: fence after operation");
+
+  // return
+  return d_first + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp
new file mode 100644
index 000000000..2afb0a74f
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Search.hpp
@@ -0,0 +1,191 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SEARCH_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_SEARCH_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Equal.hpp>
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class ReducerType, class PredicateType>
+struct StdSearchFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType1 m_first;
+  IteratorType1 m_last;
+  IteratorType2 m_s_first;
+  IteratorType2 m_s_last;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    namespace KE = ::Kokkos::Experimental;
+    auto myit    = m_first + i;
+    bool found   = true;
+
+    const auto search_count = KE::distance(m_s_first, m_s_last);
+    for (IndexType k = 0; k < search_count; ++k) {
+      // note that we add this EXPECT to check if we are in a valid range
+      // but I think we can remove this beceause the guarantee we don't go
+      // out of bounds is taken care of at the calling site
+      // where we launch the par-reduce.
+      KOKKOS_EXPECTS((myit + k) < m_last);
+
+      if (!m_p(myit[k], m_s_first[k])) {
+        found = false;
+        break;
+      }
+    }
+
+    const auto rv =
+        found ? red_value_type{i}
+              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdSearchFunctor(IteratorType1 first, IteratorType1 last,
+                   IteratorType2 s_first, IteratorType2 s_last,
+                   ReducerType reducer, PredicateType p)
+      : m_first(std::move(first)),
+        m_last(std::move(last)),
+        m_s_first(std::move(s_first)),
+        m_s_last(std::move(s_last)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class BinaryPredicateType>
+IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
+                          IteratorType1 first, IteratorType1 last,
+                          IteratorType2 s_first, IteratorType2 s_last,
+                          const BinaryPredicateType& pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, s_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, s_first);
+  Impl::expect_valid_range(first, last);
+  Impl::expect_valid_range(s_first, s_last);
+
+  // the target sequence should not be larger than the range [first, last)
+  namespace KE            = ::Kokkos::Experimental;
+  const auto num_elements = KE::distance(first, last);
+  const auto s_count      = KE::distance(s_first, s_last);
+  KOKKOS_EXPECTS(num_elements >= s_count);
+  (void)s_count;  // needed when macro above is a no-op
+
+  if (s_first == s_last) {
+    return first;
+  }
+
+  if (first == last) {
+    return last;
+  }
+
+  // special case where the two ranges have equal size
+  if (num_elements == s_count) {
+    const auto equal_result = equal_impl(label, ex, first, last, s_first, pred);
+    return (equal_result) ? first : last;
+  } else {
+    using index_type           = typename IteratorType1::difference_type;
+    using reducer_type         = FirstLoc<index_type>;
+    using reduction_value_type = typename reducer_type::value_type;
+    using func_t = StdSearchFunctor<index_type, IteratorType1, IteratorType2,
+                                    reducer_type, BinaryPredicateType>;
+
+    // run
+    reduction_value_type red_result;
+    reducer_type reducer(red_result);
+
+    // decide the size of the range policy of the par_red:
+    // note that the last feasible index to start looking is the index
+    // whose distance from the "last" is equal to the sequence count.
+    // the +1 is because we need to include that location too.
+    const auto range_size = num_elements - s_count + 1;
+
+    // run par reduce
+    ::Kokkos::parallel_reduce(
+        label, RangePolicy<ExecutionSpace>(ex, 0, range_size),
+        func_t(first, last, s_first, s_last, reducer, pred), reducer);
+
+    // fence not needed because reducing into scalar
+
+    // decide and return
+    if (red_result.min_loc_true ==
+        ::Kokkos::reduction_identity<index_type>::min()) {
+      // location has not been found
+      return last;
+    } else {
+      // location has been found
+      return first + red_result.min_loc_true;
+    }
+  }
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType1 search_impl(const std::string& label, const ExecutionSpace& ex,
+                          IteratorType1 first, IteratorType1 last,
+                          IteratorType2 s_first, IteratorType2 s_last) {
+  using value_type1    = typename IteratorType1::value_type;
+  using value_type2    = typename IteratorType2::value_type;
+  using predicate_type = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+  return search_impl(label, ex, first, last, s_first, s_last, predicate_type());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp
new file mode 100644
index 000000000..cd8b39438
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SearchN.hpp
@@ -0,0 +1,205 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SEARCH_N_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_SEARCH_N_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_AllOfAnyOfNoneOf.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType, class SizeType, class ValueType,
+          class ReducerType, class PredicateType>
+struct StdSearchNFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  IteratorType m_first;
+  IteratorType m_last;
+  SizeType m_count;
+  ValueType m_value;
+  ReducerType m_reducer;
+  PredicateType m_p;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    namespace KE = ::Kokkos::Experimental;
+    auto myit    = m_first + i;
+    bool found   = true;
+
+    for (SizeType k = 0; k < m_count; ++k) {
+      // note that we add this EXPECT to check if we are in a valid range
+      // but I think we can remove this beceause the guarantee we don't go
+      // out of bounds is taken care of at the calling site
+      // where we launch the par-reduce.
+      KOKKOS_EXPECTS((myit + k) < m_last);
+
+      if (!m_p(myit[k], m_value)) {
+        found = false;
+        break;
+      }
+    }
+
+    const auto rv =
+        found ? red_value_type{i}
+              : red_value_type{::Kokkos::reduction_identity<IndexType>::min()};
+
+    m_reducer.join(red_value, rv);
+  }
+
+  KOKKOS_FUNCTION
+  StdSearchNFunctor(IteratorType first, IteratorType last, SizeType count,
+                    ValueType value, ReducerType reducer, PredicateType p)
+      : m_first(std::move(first)),
+        m_last(std::move(last)),
+        m_count(std::move(count)),
+        m_value(std::move(value)),
+        m_reducer(std::move(reducer)),
+        m_p(std::move(p)) {}
+};
+
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class ValueType, class BinaryPredicateType>
+IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
+                           IteratorType first, IteratorType last,
+                           SizeType count, const ValueType& value,
+                           const BinaryPredicateType& pred) {
+  // checks
+  static_assert_random_access_and_accessible(ex, first);
+  expect_valid_range(first, last);
+  KOKKOS_EXPECTS((std::ptrdiff_t)count >= 0);
+
+  // count should not be larger than the range [first, last)
+  namespace KE            = ::Kokkos::Experimental;
+  const auto num_elements = KE::distance(first, last);
+  // cast things to avoid compiler warning
+  KOKKOS_EXPECTS((std::size_t)num_elements >= (std::size_t)count);
+
+  if (first == last) {
+    return first;
+  }
+
+  // special case where num elements in [first, last) == count
+  if ((std::size_t)num_elements == (std::size_t)count) {
+    using equal_to_value = StdAlgoEqualsValUnaryPredicate<ValueType>;
+    const auto satisfies =
+        all_of_impl(label, ex, first, last, equal_to_value(value));
+    return (satisfies) ? first : last;
+  } else {
+    // aliases
+    using index_type           = typename IteratorType::difference_type;
+    using reducer_type         = FirstLoc<index_type>;
+    using reduction_value_type = typename reducer_type::value_type;
+    using func_t =
+        StdSearchNFunctor<index_type, IteratorType, SizeType, ValueType,
+                          reducer_type, BinaryPredicateType>;
+
+    // run
+    reduction_value_type red_result;
+    reducer_type reducer(red_result);
+
+    // decide the size of the range policy of the par_red:
+    // the last feasible index to start looking is the index
+    // whose distance from the "last" is equal to count.
+    // the +1 is because we need to include that location too.
+    const auto range_size = num_elements - count + 1;
+
+    // run par reduce
+    ::Kokkos::parallel_reduce(
+        label, RangePolicy<ExecutionSpace>(ex, 0, range_size),
+        func_t(first, last, count, value, reducer, pred), reducer);
+
+    // fence not needed because reducing into scalar
+
+    // decide and return
+    if (red_result.min_loc_true ==
+        ::Kokkos::reduction_identity<index_type>::min()) {
+      // location has not been found
+      return last;
+    } else {
+      // location has been found
+      return first + red_result.min_loc_true;
+    }
+  }
+}
+
+template <class ExecutionSpace, class IteratorType, class SizeType,
+          class ValueType>
+IteratorType search_n_impl(const std::string& label, const ExecutionSpace& ex,
+                           IteratorType first, IteratorType last,
+                           SizeType count, const ValueType& value) {
+  using iter_value_type = typename IteratorType::value_type;
+  using predicate_type =
+      StdAlgoEqualBinaryPredicate<iter_value_type, ValueType>;
+
+  /* above we use <iter_value_type, ValueType> for the predicate_type
+     to be consistent with the standard, which says:
+
+     "
+     The signature of the predicate function should be equivalent to:
+
+        bool pred(const Type1 &a, const Type2 &b);
+
+     The type Type1 must be such that an object of type ForwardIt can be
+     dereferenced and then implicitly converted to Type1. The type Type2 must be
+     such that an object of type T can be implicitly converted to Type2.
+     "
+
+     In our case, IteratorType = ForwardIt, and ValueType = T.
+   */
+
+  return search_n_impl(label, ex, first, last, count, value, predicate_type());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
new file mode 100644
index 000000000..796864461
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp
@@ -0,0 +1,139 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SHIFT_LEFT_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_SHIFT_LEFT_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Move.hpp>
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType shift_left_impl(const std::string& label, const ExecutionSpace& ex,
+                             IteratorType first, IteratorType last,
+                             typename IteratorType::difference_type n) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+  KOKKOS_EXPECTS(n >= 0);
+
+  // handle trivial cases
+  if (n == 0) {
+    return last;
+  }
+
+  if (n >= Kokkos::Experimental::distance(first, last)) {
+    return first;
+  }
+
+  /*
+    Suppose that n = 5, and our [first,last) spans:
+
+    | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 | -3 | 1  | -6 | *
+      ^                         				  ^
+    first							 last
+
+    shift_left modifies the range such that we have this data:
+    | 1  | 2  | 2  | 10  | -3 | 1  | -6 | x | x  | x  | x  |  x | *
+                                          ^
+                                   return it pointing here
+
+
+    and returns an iterator pointing to one past the new end.
+    Note: elements marked x are in undefined state because have been moved.
+
+    We implement this in two steps:
+    step 1:
+      we create a temporary view with extent = distance(first+n, last)
+      and *move* assign the elements from [first+n, last) to tmp view, such that
+      tmp view becomes:
+
+      | 1  | 2  | 2  | 10  | -3 | 1  | -6 |
+
+    step 2:
+      move elements of tmp view back to range starting at first.
+   */
+
+  const auto num_elements_to_move =
+      ::Kokkos::Experimental::distance(first + n, last);
+
+  // create tmp view
+  using value_type    = typename IteratorType::value_type;
+  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
+  tmp_view_type tmp_view("shift_left_impl", num_elements_to_move);
+  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
+
+  using index_type = typename IteratorType::difference_type;
+
+  // step 1
+  using step1_func_type =
+      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_to_move),
+      step1_func_type(first + n, begin(tmp_view)));
+
+  // step 2
+  using step2_func_type =
+      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
+                         step2_func_type(begin(tmp_view), first));
+
+  ex.fence("Kokkos::shift_left: fence after operation");
+
+  return last - n;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
new file mode 100644
index 000000000..0ffde42ab
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp
@@ -0,0 +1,139 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SHIFT_RIGHT_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_SHIFT_RIGHT_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Move.hpp>
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType shift_right_impl(const std::string& label,
+                              const ExecutionSpace& ex, IteratorType first,
+                              IteratorType last,
+                              typename IteratorType::difference_type n) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+  KOKKOS_EXPECTS(n >= 0);
+
+  // handle trivial cases
+  if (n == 0) {
+    return first;
+  }
+
+  if (n >= Kokkos::Experimental::distance(first, last)) {
+    return last;
+  }
+
+  /*
+    Suppose that n = 3, and [first,last) spans:
+
+    | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 | -3 | 1  | -6 | *
+      ^                         				  ^
+    first							 last
+
+    shift_right modifies the range such that we have this data:
+    |  x | x  | x  | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 | *
+                     ^
+             return it points here
+
+    and returns an iterator pointing to the new beginning.
+    Note: elements marked x are in undefined state because have been moved.
+
+    We implement this in two steps:
+    step 1:
+      we create a temporary view with extent = distance(first, last-n)
+      and *move* assign the elements from [first, last-n) to tmp view, such that
+      tmp view becomes:
+
+      | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 |
+
+    step 2:
+      move elements of tmp view back to range starting at first+n.
+   */
+
+  const auto num_elements_to_move =
+      ::Kokkos::Experimental::distance(first, last - n);
+
+  // create tmp view
+  using value_type    = typename IteratorType::value_type;
+  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
+  tmp_view_type tmp_view("shift_right_impl", num_elements_to_move);
+  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
+
+  using index_type = typename IteratorType::difference_type;
+
+  // step 1
+  using step1_func_type =
+      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_to_move),
+      step1_func_type(first, begin(tmp_view)));
+
+  // step 2
+  using step2_func_type =
+      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
+                         step2_func_type(begin(tmp_view), first + n));
+
+  ex.fence("Kokkos::shift_right: fence after operation");
+
+  return first + n;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
new file mode 100644
index 000000000..3e6ca1469
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp
@@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_SWAP_RANGES_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_SWAP_RANGES_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <std_algorithms/Kokkos_Swap.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class IteratorType1, class IteratorType2>
+struct StdSwapRangesFunctor {
+  IteratorType1 m_first1;
+  IteratorType2 m_first2;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const {
+    // the swap below is doing the same thing, but
+    // for Intel 18.0.5 does not work.
+    // But putting the impl directly here, it works.
+#ifdef KOKKOS_COMPILER_INTEL
+    typename IteratorType1::value_type tmp = std::move(m_first1[i]);
+    m_first1[i]                            = std::move(m_first2[i]);
+    m_first2[i]                            = std::move(tmp);
+#else
+    ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]);
+#endif
+  }
+
+  KOKKOS_FUNCTION
+  StdSwapRangesFunctor(IteratorType1 _first1, IteratorType2 _first2)
+      : m_first1(std::move(_first1)), m_first2(std::move(_first2)) {}
+};
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2>
+IteratorType2 swap_ranges_impl(const std::string& label,
+                               const ExecutionSpace& ex, IteratorType1 first1,
+                               IteratorType1 last1, IteratorType2 first2) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  // aliases
+  using index_type = typename IteratorType1::difference_type;
+  using func_t = StdSwapRangesFunctor<index_type, IteratorType1, IteratorType2>;
+
+  // run
+  const auto num_elements_to_swap =
+      Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_for(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_to_swap),
+      func_t(first1, first2));
+  ex.fence("Kokkos::swap_ranges: fence after operation");
+
+  // return
+  return first2 + num_elements_to_swap;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp
new file mode 100644
index 000000000..5d2c0cc98
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Transform.hpp
@@ -0,0 +1,158 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_TRANSFORM_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_TRANSFORM_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIterator, class OutputIterator,
+          class UnaryFunctorType>
+struct StdTransformFunctor {
+  InputIterator m_first;
+  OutputIterator m_d_first;
+  UnaryFunctorType m_unary_op;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const { m_d_first[i] = m_unary_op(m_first[i]); }
+
+  KOKKOS_FUNCTION
+  StdTransformFunctor(InputIterator _first, OutputIterator _m_d_first,
+                      UnaryFunctorType _functor)
+      : m_first(std::move(_first)),
+        m_d_first(std::move(_m_d_first)),
+        m_unary_op(std::move(_functor)) {}
+};
+
+template <class IndexType, class InputIterator1, class InputIterator2,
+          class OutputIterator, class BinaryFunctorType>
+struct StdTransformBinaryFunctor {
+  InputIterator1 m_first1;
+  InputIterator2 m_first2;
+  OutputIterator m_d_first;
+  BinaryFunctorType m_binary_op;
+
+  KOKKOS_FUNCTION
+  void operator()(IndexType i) const {
+    m_d_first[i] = m_binary_op(m_first1[i], m_first2[i]);
+  }
+
+  KOKKOS_FUNCTION
+  StdTransformBinaryFunctor(InputIterator1 _first1, InputIterator2 _first2,
+                            OutputIterator _m_d_first,
+                            BinaryFunctorType _functor)
+      : m_first1(std::move(_first1)),
+        m_first2(std::move(_first2)),
+        m_d_first(std::move(_m_d_first)),
+        m_binary_op(std::move(_functor)) {}
+};
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class UnaryOperation>
+OutputIterator transform_impl(const std::string& label,
+                              const ExecutionSpace& ex, InputIterator first1,
+                              InputIterator last1, OutputIterator d_first,
+                              UnaryOperation unary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, d_first);
+  Impl::expect_valid_range(first1, last1);
+
+  // aliases
+  using index_type = typename InputIterator::difference_type;
+  using func_t = StdTransformFunctor<index_type, InputIterator, OutputIterator,
+                                     UnaryOperation>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first1, d_first, unary_op));
+  ex.fence("Kokkos::transform: fence after operation");
+
+  // return
+  return d_first + num_elements;
+}
+
+template <class ExecutionSpace, class InputIterator1, class InputIterator2,
+          class OutputIterator, class BinaryOperation>
+OutputIterator transform_impl(const std::string& label,
+                              const ExecutionSpace& ex, InputIterator1 first1,
+                              InputIterator1 last1, InputIterator2 first2,
+                              OutputIterator d_first,
+                              BinaryOperation binary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, first2, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2,
+                                                              d_first);
+  Impl::expect_valid_range(first1, last1);
+
+  // aliases
+  using index_type = typename InputIterator1::difference_type;
+  using func_t =
+      StdTransformBinaryFunctor<index_type, InputIterator1, InputIterator2,
+                                OutputIterator, BinaryOperation>;
+
+  // run
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_for(label,
+                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                         func_t(first1, first2, d_first, binary_op));
+  ex.fence("Kokkos::transform: fence after operation");
+  return d_first + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp
new file mode 100644
index 000000000..9fb8cbcc3
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformExclusiveScan.hpp
@@ -0,0 +1,153 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_TRANSFORM_EXCLUSIVE_SCAN_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_TRANSFORM_EXCLUSIVE_SCAN_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_ValueWrapperForNoNeutralElement.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest, class BinaryOpType, class UnaryOpType>
+struct TransformExclusiveScanFunctor {
+  using execution_space = ExeSpace;
+  using value_type =
+      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
+
+  ValueType m_init_value;
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  BinaryOpType m_binary_op;
+  UnaryOpType m_unary_op;
+
+  KOKKOS_FUNCTION
+  TransformExclusiveScanFunctor(ValueType init, FirstFrom first_from,
+                                FirstDest first_dest, BinaryOpType bop,
+                                UnaryOpType uop)
+      : m_init_value(std::move(init)),
+        m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_binary_op(std::move(bop)),
+        m_unary_op(std::move(uop)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    if (final_pass) {
+      if (i == 0) {
+        // for both ExclusiveScan and TransformExclusiveScan,
+        // init is unmodified
+        m_first_dest[i] = m_init_value;
+      } else {
+        m_first_dest[i] = m_binary_op(update.val, m_init_value);
+      }
+    }
+
+    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
+    this->join(update, tmp);
+  }
+
+  KOKKOS_FUNCTION
+  void init(value_type& update) const {
+    update.val        = {};
+    update.is_initial = true;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    if (update.is_initial) {
+      update.val = input.val;
+    } else {
+      update.val = m_binary_op(update.val, input.val);
+    }
+    update.is_initial = false;
+  }
+};
+
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class ValueType, class BinaryOpType,
+          class UnaryOpType>
+OutputIteratorType transform_exclusive_scan_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop,
+    UnaryOpType uop) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using func_type =
+      TransformExclusiveScanFunctor<ExecutionSpace, index_type, ValueType,
+                                    InputIteratorType, OutputIteratorType,
+                                    BinaryOpType, UnaryOpType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_type(init_value, first_from, first_dest, bop, uop));
+  ex.fence("Kokkos::transform_exclusive_scan: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp
new file mode 100644
index 000000000..281eb6f8a
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformInclusiveScan.hpp
@@ -0,0 +1,235 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_TRANSFORM_INCLUSIVE_SCAN_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_TRANSFORM_INCLUSIVE_SCAN_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_ValueWrapperForNoNeutralElement.hpp"
+#include "Kokkos_IdentityReferenceUnaryFunctor.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest, class BinaryOpType, class UnaryOpType>
+struct TransformInclusiveScanNoInitValueFunctor {
+  using execution_space = ExeSpace;
+  using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
+
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  BinaryOpType m_binary_op;
+  UnaryOpType m_unary_op;
+
+  KOKKOS_FUNCTION
+  TransformInclusiveScanNoInitValueFunctor(FirstFrom first_from,
+                                           FirstDest first_dest,
+                                           BinaryOpType bop, UnaryOpType uop)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_binary_op(std::move(bop)),
+        m_unary_op(std::move(uop)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
+    this->join(update, tmp);
+    if (final_pass) {
+      m_first_dest[i] = update.val;
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void init(value_type& update) const {
+    update.val        = {};
+    update.is_initial = true;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    if (update.is_initial) {
+      update.val = input.val;
+    } else {
+      update.val = m_binary_op(update.val, input.val);
+    }
+    update.is_initial = false;
+  }
+};
+
+template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
+          class FirstDest, class BinaryOpType, class UnaryOpType>
+struct TransformInclusiveScanWithInitValueFunctor {
+  using execution_space = ExeSpace;
+  using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
+
+  FirstFrom m_first_from;
+  FirstDest m_first_dest;
+  BinaryOpType m_binary_op;
+  UnaryOpType m_unary_op;
+  ValueType m_init;
+
+  KOKKOS_FUNCTION
+  TransformInclusiveScanWithInitValueFunctor(FirstFrom first_from,
+                                             FirstDest first_dest,
+                                             BinaryOpType bop, UnaryOpType uop,
+                                             ValueType init)
+      : m_first_from(std::move(first_from)),
+        m_first_dest(std::move(first_dest)),
+        m_binary_op(std::move(bop)),
+        m_unary_op(std::move(uop)),
+        m_init(std::move(init)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, value_type& update,
+                  const bool final_pass) const {
+    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
+    this->join(update, tmp);
+
+    if (final_pass) {
+      m_first_dest[i] = m_binary_op(update.val, m_init);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void init(value_type& update) const {
+    update.val        = {};
+    update.is_initial = true;
+  }
+
+  KOKKOS_FUNCTION
+  void join(value_type& update, const value_type& input) const {
+    if (update.is_initial) {
+      update.val = input.val;
+    } else {
+      update.val = m_binary_op(update.val, input.val);
+    }
+    update.is_initial = false;
+  }
+};
+
+// -------------------------------------------------------------
+// transform_inclusive_scan_impl without init_value
+// -------------------------------------------------------------
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
+OutputIteratorType transform_inclusive_scan_impl(const std::string& label,
+                                                 const ExecutionSpace& ex,
+                                                 InputIteratorType first_from,
+                                                 InputIteratorType last_from,
+                                                 OutputIteratorType first_dest,
+                                                 BinaryOpType binary_op,
+                                                 UnaryOpType unary_op) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using value_type =
+      std::remove_const_t<typename InputIteratorType::value_type>;
+  using func_type = TransformInclusiveScanNoInitValueFunctor<
+      ExecutionSpace, index_type, value_type, InputIteratorType,
+      OutputIteratorType, BinaryOpType, UnaryOpType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_type(first_from, first_dest, binary_op, unary_op));
+  ex.fence("Kokkos::transform_inclusive_scan: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+// -------------------------------------------------------------
+// transform_inclusive_scan_impl with init_value
+// -------------------------------------------------------------
+template <class ExecutionSpace, class InputIteratorType,
+          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
+          class ValueType>
+OutputIteratorType transform_inclusive_scan_impl(
+    const std::string& label, const ExecutionSpace& ex,
+    InputIteratorType first_from, InputIteratorType last_from,
+    OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op,
+    ValueType init_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
+  Impl::static_assert_iterators_have_matching_difference_type(first_from,
+                                                              first_dest);
+  Impl::expect_valid_range(first_from, last_from);
+
+  // aliases
+  using index_type = typename InputIteratorType::difference_type;
+  using func_type  = TransformInclusiveScanWithInitValueFunctor<
+      ExecutionSpace, index_type, ValueType, InputIteratorType,
+      OutputIteratorType, BinaryOpType, UnaryOpType>;
+
+  // run
+  const auto num_elements =
+      Kokkos::Experimental::distance(first_from, last_from);
+  ::Kokkos::parallel_scan(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      func_type(first_from, first_dest, binary_op, unary_op, init_value));
+  ex.fence("Kokkos::transform_inclusive_scan: fence after operation");
+
+  // return
+  return first_dest + num_elements;
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp
new file mode 100644
index 000000000..e3a780f48
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp
@@ -0,0 +1,245 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_TRANSFORM_REDUCE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_TRANSFORM_REDUCE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class ValueType>
+struct StdTranformReduceDefaultBinaryTransformFunctor {
+  KOKKOS_FUNCTION
+  constexpr ValueType operator()(const ValueType& a, const ValueType& b) const {
+    return (a * b);
+  }
+};
+
+template <class ValueType>
+struct StdTranformReduceDefaultJoinFunctor {
+  KOKKOS_FUNCTION
+  constexpr ValueType operator()(const ValueType& a, const ValueType& b) const {
+    return a + b;
+  }
+};
+
+template <class IteratorType, class ReducerType, class TransformType>
+struct StdTransformReduceSingleIntervalFunctor {
+  using red_value_type = typename ReducerType::value_type;
+  using index_type     = typename IteratorType::difference_type;
+
+  const IteratorType m_first;
+  const ReducerType m_reducer;
+  const TransformType m_transform;
+
+  KOKKOS_FUNCTION
+  void operator()(const index_type i, red_value_type& red_value) const {
+    auto tmp_wrapped_value = red_value_type{m_transform(m_first[i]), false};
+    if (red_value.is_initial) {
+      red_value = tmp_wrapped_value;
+    } else {
+      m_reducer.join(red_value, tmp_wrapped_value);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdTransformReduceSingleIntervalFunctor(IteratorType first,
+                                          ReducerType reducer,
+                                          TransformType transform)
+      : m_first(std::move(first)),
+        m_reducer(std::move(reducer)),
+        m_transform(std::move(transform)) {}
+};
+
+template <class IndexType, class IteratorType1, class IteratorType2,
+          class ReducerType, class TransformType>
+struct StdTransformReduceTwoIntervalsFunctor {
+  using red_value_type = typename ReducerType::value_type;
+
+  const IteratorType1 m_first1;
+  const IteratorType2 m_first2;
+  const ReducerType m_reducer;
+  const TransformType m_transform;
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, red_value_type& red_value) const {
+    auto tmp_wrapped_value =
+        red_value_type{m_transform(m_first1[i], m_first2[i]), false};
+
+    if (red_value.is_initial) {
+      red_value = tmp_wrapped_value;
+    } else {
+      m_reducer.join(red_value, tmp_wrapped_value);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  StdTransformReduceTwoIntervalsFunctor(IteratorType1 first1,
+                                        IteratorType2 first2,
+                                        ReducerType reducer,
+                                        TransformType transform)
+      : m_first1(std::move(first1)),
+        m_first2(std::move(first2)),
+        m_reducer(std::move(reducer)),
+        m_transform(std::move(transform)) {}
+};
+
+//------------------------------
+//
+// impl functions
+//
+//------------------------------
+
+template <class ExecutionSpace, class IteratorType, class ValueType,
+          class JoinerType, class UnaryTransformerType>
+ValueType transform_reduce_custom_functors_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType first,
+    IteratorType last, ValueType init_reduction_value, JoinerType joiner,
+    UnaryTransformerType transformer) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::expect_valid_range(first, last);
+
+  if (first == last) {
+    // init is returned, unmodified
+    return init_reduction_value;
+  }
+
+  // aliases
+  using reducer_type =
+      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
+  using functor_type =
+      StdTransformReduceSingleIntervalFunctor<IteratorType, reducer_type,
+                                              UnaryTransformerType>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  reduction_value_type result;
+  reducer_type reducer(result, joiner);
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  ::Kokkos::parallel_reduce(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+                            functor_type(first, reducer, transformer), reducer);
+
+  // fence not needed since reducing into scalar
+
+  // as per standard, transform is not applied to the init value
+  // https://en.cppreference.com/w/cpp/algorithm/transform_reduce
+  return joiner(result.val, init_reduction_value);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class ValueType, class JoinerType, class BinaryTransformerType>
+ValueType transform_reduce_custom_functors_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value,
+    JoinerType joiner, BinaryTransformerType transformer) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  if (first1 == last1) {
+    // init is returned, unmodified
+    return init_reduction_value;
+  }
+
+  // aliases
+  using index_type = typename IteratorType1::difference_type;
+  using reducer_type =
+      ReducerWithArbitraryJoinerNoNeutralElement<ValueType, JoinerType>;
+  using functor_type =
+      StdTransformReduceTwoIntervalsFunctor<index_type, IteratorType1,
+                                            IteratorType2, reducer_type,
+                                            BinaryTransformerType>;
+  using reduction_value_type = typename reducer_type::value_type;
+
+  // run
+  reduction_value_type result;
+  reducer_type reducer(result, joiner);
+
+  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
+  ::Kokkos::parallel_reduce(
+      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
+      functor_type(first1, first2, reducer, transformer), reducer);
+
+  // fence not needed since reducing into scalar
+  return joiner(result.val, init_reduction_value);
+}
+
+template <class ExecutionSpace, class IteratorType1, class IteratorType2,
+          class ValueType>
+ValueType transform_reduce_default_functors_impl(
+    const std::string& label, const ExecutionSpace& ex, IteratorType1 first1,
+    IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
+  Impl::static_assert_is_not_openmptarget(ex);
+  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
+  Impl::expect_valid_range(first1, last1);
+
+  // aliases
+  using transformer_type =
+      Impl::StdTranformReduceDefaultBinaryTransformFunctor<ValueType>;
+  using joiner_type = Impl::StdTranformReduceDefaultJoinFunctor<ValueType>;
+
+  return transform_reduce_custom_functors_impl(
+      label, ex, first1, last1, first2, std::move(init_reduction_value),
+      joiner_type(), transformer_type());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
new file mode 100644
index 000000000..5e4ea7d79
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp
@@ -0,0 +1,193 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_UNIQUE_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_UNIQUE_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include <std_algorithms/Kokkos_Move.hpp>
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <std_algorithms/Kokkos_AdjacentFind.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIt, class OutputIt,
+          class BinaryPredicateType>
+struct StdUniqueFunctor {
+  InputIt m_first_from;
+  InputIt m_last_from;
+  OutputIt m_first_dest;
+  BinaryPredicateType m_pred;
+
+  KOKKOS_FUNCTION
+  StdUniqueFunctor(InputIt first_from, InputIt last_from, OutputIt first_dest,
+                   BinaryPredicateType pred)
+      : m_first_from(std::move(first_from)),
+        m_last_from(std::move(last_from)),
+        m_first_dest(std::move(first_dest)),
+        m_pred(std::move(pred)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, IndexType& update,
+                  const bool final_pass) const {
+    auto& val_i         = m_first_from[i];
+    const auto& val_ip1 = m_first_from[i + 1];
+
+    if (final_pass) {
+      if (!m_pred(val_i, val_ip1)) {
+        m_first_dest[update] = std::move(val_i);
+      }
+    }
+
+    if (!m_pred(val_i, val_ip1)) {
+      update += 1;
+    }
+  }
+};
+
+template <class ExecutionSpace, class IteratorType, class PredicateType>
+IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, IteratorType last,
+                         PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first);
+  Impl::expect_valid_range(first, last);
+
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  if (num_elements == 0) {
+    return first;
+  } else if (num_elements == 1) {
+    return last;
+  } else {
+    // ----------
+    // step 1:
+    // find first location of adjacent equal elements
+    // ----------
+    auto it_found =
+        ::Kokkos::Experimental::adjacent_find(ex, first, last, pred);
+
+    // if none, all elements are unique, so nothing to do
+    if (it_found == last) {
+      return last;
+    } else {
+      // if here, we found some equal adjacent elements,
+      // so count all preceeding unique elements
+      const auto num_unique_found_in_step_one = it_found - first;
+
+      // ----------
+      // step 2:
+      // ----------
+      // since we found some unique elements, we don't need to explore
+      // the full range [first, last), but only need to focus on the
+      // remaining range [it_found, last)
+      const auto num_elements_to_explore = last - it_found;
+
+      // create a tmp view to use to *move* all unique elements
+      // using the same algorithm used for unique_copy but we now move things
+      using value_type    = typename IteratorType::value_type;
+      using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
+      tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore);
+
+      // scan extent is: num_elements_to_explore - 1
+      // for same reason as the one explained in unique_copy
+      const auto scan_size = num_elements_to_explore - 1;
+      auto tmp_first       = ::Kokkos::Experimental::begin(tmp_view);
+      using output_it      = decltype(tmp_first);
+
+      using index_type = typename IteratorType::difference_type;
+      using func_type =
+          StdUniqueFunctor<index_type, IteratorType, output_it, PredicateType>;
+      index_type count = 0;
+      ::Kokkos::parallel_scan(
+          label, RangePolicy<ExecutionSpace>(ex, 0, scan_size),
+          func_type(it_found, last, tmp_first, pred), count);
+
+      // move last element too, for the same reason as the unique_copy
+      auto unused_r =
+          Impl::move_impl("Kokkos::move_from_unique", ex, it_found + scan_size,
+                          last, tmp_first + count);
+      (void)unused_r;  // r1 not used
+
+      // ----------
+      // step 3
+      // ----------
+      // move back from tmp to original range,
+      // ensuring we start overwriting after the original unique found
+      using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
+      using step3_func_t =
+          StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
+
+      ::Kokkos::parallel_for(
+          "unique_step3_parfor",
+          RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
+          step3_func_t(begin(tmp_view),
+                       (first + num_unique_found_in_step_one)));
+
+      ex.fence("Kokkos::unique: fence after operation");
+
+      // return iterator to one passed the last written
+      // (the +1 is needed to account for the last element, see above)
+      return (first + num_unique_found_in_step_one + count + 1);
+    }
+  }
+}
+
+template <class ExecutionSpace, class IteratorType>
+IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
+                         IteratorType first, IteratorType last) {
+  using value_type    = typename IteratorType::value_type;
+  using binary_pred_t = StdAlgoEqualBinaryPredicate<value_type>;
+  return unique_impl(label, ex, first, last, binary_pred_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp
new file mode 100644
index 000000000..e4fd6f3ed
--- /dev/null
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STD_ALGORITHMS_UNIQUE_COPY_IMPL_HPP
+#define KOKKOS_STD_ALGORITHMS_UNIQUE_COPY_IMPL_HPP
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Constraints.hpp"
+#include "Kokkos_HelperPredicates.hpp"
+#include "Kokkos_CopyCopyN.hpp"
+#include <std_algorithms/Kokkos_Distance.hpp>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template <class IndexType, class InputIt, class OutputIt,
+          class BinaryPredicateType>
+struct StdUniqueCopyFunctor {
+  InputIt m_first_from;
+  InputIt m_last_from;
+  OutputIt m_first_dest;
+  BinaryPredicateType m_pred;
+
+  KOKKOS_FUNCTION
+  StdUniqueCopyFunctor(InputIt first_from, InputIt last_from,
+                       OutputIt first_dest, BinaryPredicateType pred)
+      : m_first_from(std::move(first_from)),
+        m_last_from(std::move(last_from)),
+        m_first_dest(std::move(first_dest)),
+        m_pred(std::move(pred)) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const IndexType i, IndexType& update,
+                  const bool final_pass) const {
+    const auto& val_i   = m_first_from[i];
+    const auto& val_ip1 = m_first_from[i + 1];
+
+    if (final_pass) {
+      if (!m_pred(val_i, val_ip1)) {
+        m_first_dest[update] = val_i;
+      }
+    }
+
+    if (!m_pred(val_i, val_ip1)) {
+      update += 1;
+    }
+  }
+};
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator,
+          class PredicateType>
+OutputIterator unique_copy_impl(const std::string& label,
+                                const ExecutionSpace& ex, InputIterator first,
+                                InputIterator last, OutputIterator d_first,
+                                PredicateType pred) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // branch for trivial vs non trivial case
+  const auto num_elements = Kokkos::Experimental::distance(first, last);
+  if (num_elements == 0) {
+    return d_first;
+  } else if (num_elements == 1) {
+    return Impl::copy_impl("Kokkos::copy_from_unique_copy", ex, first, last,
+                           d_first);
+  } else {
+    // aliases
+    using index_type = typename InputIterator::difference_type;
+    using func_type  = StdUniqueCopyFunctor<index_type, InputIterator,
+                                           OutputIterator, PredicateType>;
+
+    // note here that we run scan for num_elements - 1
+    // because of the way we implement this, the last element is always needed.
+    // We avoid performing checks inside functor that we are within limits
+    // and run a "safe" scan and then copy the last element.
+    const auto scan_size = num_elements - 1;
+    index_type count     = 0;
+    ::Kokkos::parallel_scan(label,
+                            RangePolicy<ExecutionSpace>(ex, 0, scan_size),
+                            func_type(first, last, d_first, pred), count);
+
+    return Impl::copy_impl("Kokkos::copy_from_unique_copy", ex,
+                           first + scan_size, last, d_first + count);
+  }
+}
+
+template <class ExecutionSpace, class InputIterator, class OutputIterator>
+OutputIterator unique_copy_impl(const std::string& label,
+                                const ExecutionSpace& ex, InputIterator first,
+                                InputIterator last, OutputIterator d_first) {
+  // checks
+  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
+  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
+  Impl::expect_valid_range(first, last);
+
+  // aliases
+  using value_type1 = typename InputIterator::value_type;
+  using value_type2 = typename OutputIterator::value_type;
+
+  // default binary predicate uses ==
+  using binary_pred_t = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
+
+  // run
+  return unique_copy_impl(label, ex, first, last, d_first, binary_pred_t());
+}
+
+}  // namespace Impl
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ValueWrapperForNoNeutralElement.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ValueWrapperForNoNeutralElement.hpp
similarity index 89%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_ValueWrapperForNoNeutralElement.hpp
rename to packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ValueWrapperForNoNeutralElement.hpp
index 019a0049d..da9b6ef9a 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ValueWrapperForNoNeutralElement.hpp
+++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ValueWrapperForNoNeutralElement.hpp
@@ -42,8 +42,8 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_VALUE_WRAPPER_FOR_NO_NEUTRAL_ELEMENT_HPP
-#define KOKKOS_STD_VALUE_WRAPPER_FOR_NO_NEUTRAL_ELEMENT_HPP
+#ifndef KOKKOS_STD_ALGORITHMS_VALUE_WRAPPER_FOR_NO_NEUTRAL_ELEMENT_HPP
+#define KOKKOS_STD_ALGORITHMS_VALUE_WRAPPER_FOR_NO_NEUTRAL_ELEMENT_HPP
 
 namespace Kokkos {
 namespace Experimental {
@@ -63,12 +63,6 @@ struct ValueWrapperForNoNeutralElement {
     val        = rhs.val;
     is_initial = rhs.is_initial;
   }
-
-  KOKKOS_FUNCTION
-  void operator=(const volatile ValueWrapperForNoNeutralElement& rhs) volatile {
-    val        = rhs.val;
-    is_initial = rhs.is_initial;
-  }
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet1.hpp b/packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet1.hpp
deleted file mode 100644
index b9b1b96ae..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet1.hpp
+++ /dev/null
@@ -1,1285 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_MODIFYING_SEQUENCE_OPERATIONS_SET1_HPP
-#define KOKKOS_MODIFYING_SEQUENCE_OPERATIONS_SET1_HPP
-
-#include <Kokkos_Core.hpp>
-#include "../Kokkos_BeginEnd.hpp"
-#include "../Kokkos_Constraints.hpp"
-#include "../Kokkos_ModifyingOperations.hpp"
-#include "../Kokkos_NonModifyingSequenceOperations.hpp"
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-//---------------------------
-//
-// functors
-//
-//---------------------------
-template <class IndexType, class InputIterator, class OutputIterator>
-struct StdCopyFunctor {
-  InputIterator m_first;
-  OutputIterator m_dest_first;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_dest_first[i] = m_first[i]; }
-
-  KOKKOS_FUNCTION
-  StdCopyFunctor(InputIterator _first, OutputIterator _dest_first)
-      : m_first(std::move(_first)), m_dest_first(std::move(_dest_first)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2>
-struct StdCopyBackwardFunctor {
-  static_assert(std::is_signed<IndexType>::value,
-                "Kokkos: StdCopyBackwardFunctor requires signed index type");
-
-  IteratorType1 m_last;
-  IteratorType2 m_dest_last;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_dest_last[-i - 1] = m_last[-i - 1]; }
-
-  KOKKOS_FUNCTION
-  StdCopyBackwardFunctor(IteratorType1 _last, IteratorType2 _dest_last)
-      : m_last(std::move(_last)), m_dest_last(std::move(_dest_last)) {}
-};
-
-template <class IndexType, class FirstFrom, class FirstDest, class PredType>
-struct StdCopyIfFunctor {
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-  PredType m_pred;
-
-  KOKKOS_FUNCTION
-  StdCopyIfFunctor(FirstFrom first_from, FirstDest first_dest, PredType pred)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_pred(std::move(pred)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, IndexType& update,
-                  const bool final_pass) const {
-    const auto& myval = m_first_from[i];
-    if (final_pass) {
-      if (m_pred(myval)) {
-        m_first_dest[update] = myval;
-      }
-    }
-
-    if (m_pred(myval)) {
-      update += 1;
-    }
-  }
-};
-
-template <class InputIterator, class T>
-struct StdFillFunctor {
-  using index_type = typename InputIterator::difference_type;
-  InputIterator m_first;
-  T m_value;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i) const { m_first[i] = m_value; }
-
-  KOKKOS_FUNCTION
-  StdFillFunctor(InputIterator _first, T _value)
-      : m_first(std::move(_first)), m_value(std::move(_value)) {}
-};
-
-template <class IndexType, class InputIterator, class OutputIterator,
-          class UnaryFunctorType>
-struct StdTransformFunctor {
-  InputIterator m_first;
-  OutputIterator m_d_first;
-  UnaryFunctorType m_unary_op;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_d_first[i] = m_unary_op(m_first[i]); }
-
-  KOKKOS_FUNCTION
-  StdTransformFunctor(InputIterator _first, OutputIterator _m_d_first,
-                      UnaryFunctorType _functor)
-      : m_first(std::move(_first)),
-        m_d_first(std::move(_m_d_first)),
-        m_unary_op(std::move(_functor)) {}
-};
-
-template <class IndexType, class InputIterator1, class InputIterator2,
-          class OutputIterator, class BinaryFunctorType>
-struct StdTransformBinaryFunctor {
-  InputIterator1 m_first1;
-  InputIterator2 m_first2;
-  OutputIterator m_d_first;
-  BinaryFunctorType m_binary_op;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    m_d_first[i] = m_binary_op(m_first1[i], m_first2[i]);
-  }
-
-  KOKKOS_FUNCTION
-  StdTransformBinaryFunctor(InputIterator1 _first1, InputIterator2 _first2,
-                            OutputIterator _m_d_first,
-                            BinaryFunctorType _functor)
-      : m_first1(std::move(_first1)),
-        m_first2(std::move(_first2)),
-        m_d_first(std::move(_m_d_first)),
-        m_binary_op(std::move(_functor)) {}
-};
-
-template <class IteratorType, class Generator>
-struct StdGenerateFunctor {
-  using index_type = typename IteratorType::difference_type;
-  IteratorType m_first;
-  Generator m_generator;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i) const { m_first[i] = m_generator(); }
-
-  KOKKOS_FUNCTION
-  StdGenerateFunctor(IteratorType _first, Generator _g)
-      : m_first(std::move(_first)), m_generator(std::move(_g)) {}
-};
-
-template <class InputIterator, class PredicateType, class NewValueType>
-struct StdReplaceIfFunctor {
-  using index_type = typename InputIterator::difference_type;
-
-  InputIterator m_first;
-  PredicateType m_predicate;
-  NewValueType m_new_value;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i) const {
-    if (m_predicate(m_first[i])) {
-      m_first[i] = m_new_value;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdReplaceIfFunctor(InputIterator first, PredicateType pred,
-                      NewValueType new_value)
-      : m_first(std::move(first)),
-        m_predicate(std::move(pred)),
-        m_new_value(std::move(new_value)) {}
-};
-
-template <class InputIterator, class ValueType>
-struct StdReplaceFunctor {
-  using index_type = typename InputIterator::difference_type;
-  InputIterator m_first;
-  ValueType m_old_value;
-  ValueType m_new_value;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i) const {
-    if (m_first[i] == m_old_value) {
-      m_first[i] = m_new_value;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdReplaceFunctor(InputIterator first, ValueType old_value,
-                    ValueType new_value)
-      : m_first(std::move(first)),
-        m_old_value(std::move(old_value)),
-        m_new_value(std::move(new_value)) {}
-};
-
-template <class InputIterator, class OutputIterator, class ValueType>
-struct StdReplaceCopyFunctor {
-  using index_type = typename InputIterator::difference_type;
-
-  InputIterator m_first_from;
-  OutputIterator m_first_dest;
-  ValueType m_old_value;
-  ValueType m_new_value;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i) const {
-    const auto& myvalue_from = m_first_from[i];
-
-    if (myvalue_from == m_old_value) {
-      m_first_dest[i] = m_new_value;
-    } else {
-      m_first_dest[i] = myvalue_from;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdReplaceCopyFunctor(InputIterator first_from, OutputIterator first_dest,
-                        ValueType old_value, ValueType new_value)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_old_value(std::move(old_value)),
-        m_new_value(std::move(new_value)) {}
-};
-
-template <class IndexType, class InputIterator, class OutputIterator,
-          class PredicateType, class ValueType>
-struct StdReplaceIfCopyFunctor {
-  InputIterator m_first_from;
-  OutputIterator m_first_dest;
-  PredicateType m_pred;
-  ValueType m_new_value;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    const auto& myvalue_from = m_first_from[i];
-
-    if (m_pred(myvalue_from)) {
-      m_first_dest[i] = m_new_value;
-    } else {
-      m_first_dest[i] = myvalue_from;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  StdReplaceIfCopyFunctor(InputIterator first_from, OutputIterator first_dest,
-                          PredicateType pred, ValueType new_value)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_pred(std::move(pred)),
-        m_new_value(std::move(new_value)) {}
-};
-
-// ------------------------------------------
-// copy_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator copy_impl(const std::string& label, const ExecutionSpace& ex,
-                         InputIterator first, InputIterator last,
-                         OutputIterator d_first) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_t     = StdCopyFunctor<index_type, InputIterator, OutputIterator>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, d_first));
-  ex.fence("Kokkos::copy: fence after operation");
-
-  // return
-  return d_first + num_elements;
-}
-
-// ------------------------------------------
-// copy_n_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class Size,
-          class OutputIterator>
-OutputIterator copy_n_impl(const std::string& label, const ExecutionSpace& ex,
-                           InputIterator first_from, Size count,
-                           OutputIterator first_dest) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-
-  if (count > 0) {
-    return copy_impl(label, ex, first_from, first_from + count, first_dest);
-  } else {
-    return first_dest;
-  }
-}
-
-// ------------------------------------------
-// copy_backward_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 copy_backward_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 d_last) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_last);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t =
-      StdCopyBackwardFunctor<index_type, IteratorType1, IteratorType2>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(last, d_last));
-  ex.fence("Kokkos::copy_backward: fence after operation");
-
-  // return
-  return d_last - num_elements;
-}
-
-// ------------------------------------------
-// copy_if_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class PredicateType>
-OutputIterator copy_if_impl(const std::string& label, const ExecutionSpace& ex,
-                            InputIterator first, InputIterator last,
-                            OutputIterator d_first, PredicateType pred) {
-  /*
-    To explain the impl, suppose that our data is:
-
-    | 1 | 1 | 2 | 2 | 3 | -2 | 4 | 4 | 4 | 5 | 7 | -10 |
-
-    and we want to copy only the even entries,
-    We can use an exclusive scan where the "update"
-    is incremented only for the elements that satisfy the predicate.
-    This way, the update allows us to track where in the destination
-    we need to copy the elements:
-
-    In this case, counting only the even entries, the exlusive scan
-    during the final pass would yield:
-
-    | 0 | 0 | 0 | 1 | 2 | 2 | 3 | 4 | 5 | 6 | 6 | 6 |
-              *   *       *   *   *   *           *
-
-    which provides the indexing in the destination where
-    each starred (*) element needs to be copied to since
-    the starred elements are those that satisfy the predicate.
-   */
-
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    return d_first;
-  } else {
-    // aliases
-    using index_type = typename InputIterator::difference_type;
-    using func_type  = StdCopyIfFunctor<index_type, InputIterator,
-                                       OutputIterator, PredicateType>;
-
-    // run
-    const auto num_elements = Kokkos::Experimental::distance(first, last);
-    index_type count        = 0;
-    ::Kokkos::parallel_scan(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                            func_type(first, d_first, pred), count);
-
-    // fence not needed because of the scan accumulating into count
-    return d_first + count;
-  }
-}
-
-// ------------------------------------------
-// fill_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class T>
-void fill_impl(const std::string& label, const ExecutionSpace& ex,
-               IteratorType first, IteratorType last, const T& value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         StdFillFunctor<IteratorType, T>(first, value));
-  ex.fence("Kokkos::fill: fence after operation");
-}
-
-template <class ExecutionSpace, class IteratorType, class SizeType, class T>
-IteratorType fill_n_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, SizeType n, const T& value) {
-  auto last = first + n;
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  if (n <= 0) {
-    return first;
-  }
-
-  fill_impl(label, ex, first, last, value);
-  return last;
-}
-
-// ------------------------------------------
-// transform_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryOperation>
-OutputIterator transform_impl(const std::string& label,
-                              const ExecutionSpace& ex, InputIterator first1,
-                              InputIterator last1, OutputIterator d_first,
-                              UnaryOperation unary_op) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, d_first);
-  Impl::expect_valid_range(first1, last1);
-
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_t = StdTransformFunctor<index_type, InputIterator, OutputIterator,
-                                     UnaryOperation>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first1, d_first, unary_op));
-  ex.fence("Kokkos::transform: fence after operation");
-
-  // return
-  return d_first + num_elements;
-}
-
-template <class ExecutionSpace, class InputIterator1, class InputIterator2,
-          class OutputIterator, class BinaryOperation>
-OutputIterator transform_impl(const std::string& label,
-                              const ExecutionSpace& ex, InputIterator1 first1,
-                              InputIterator1 last1, InputIterator2 first2,
-                              OutputIterator d_first,
-                              BinaryOperation binary_op) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, first2, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, first2,
-                                                              d_first);
-  Impl::expect_valid_range(first1, last1);
-
-  // aliases
-  using index_type = typename InputIterator1::difference_type;
-  using func_t =
-      StdTransformBinaryFunctor<index_type, InputIterator1, InputIterator2,
-                                OutputIterator, BinaryOperation>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first1, last1);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first1, first2, d_first, binary_op));
-  ex.fence("Kokkos::transform: fence after operation");
-  return d_first + num_elements;
-}
-
-// ------------------------------------------
-// generate_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class Generator>
-void generate_impl(const std::string& label, const ExecutionSpace& ex,
-                   IteratorType first, IteratorType last, Generator g) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using func_t = StdGenerateFunctor<IteratorType, Generator>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, g));
-  ex.fence("Kokkos::generate: fence after operation");
-}
-
-template <class ExecutionSpace, class IteratorType, class Size, class Generator>
-IteratorType generate_n_impl(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, Size count, Generator g) {
-  if (count <= 0) {
-    return first;
-  }
-
-  generate_impl(label, ex, first, first + count, g);
-  return first + count;
-}
-
-// ------------------------------------------
-// replace_if_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class PredicateType,
-          class ValueType>
-void replace_if_impl(const std::string& label, const ExecutionSpace& ex,
-                     IteratorType first, IteratorType last, PredicateType pred,
-                     const ValueType& new_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using func_t = StdReplaceIfFunctor<IteratorType, PredicateType, ValueType>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, std::move(pred), new_value));
-  ex.fence("Kokkos::replace_if: fence after operation");
-}
-
-// ------------------------------------------
-// replace_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class ValueType>
-void replace_impl(const std::string& label, const ExecutionSpace& ex,
-                  IteratorType first, IteratorType last,
-                  const ValueType& old_value, const ValueType& new_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using func_t = StdReplaceFunctor<IteratorType, ValueType>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, old_value, new_value));
-  ex.fence("Kokkos::replace: fence after operation");
-}
-
-// ------------------------------------------
-// replace_copy_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-OutputIteratorType replace_copy_impl(const std::string& label,
-                                     const ExecutionSpace& ex,
-                                     InputIteratorType first_from,
-                                     InputIteratorType last_from,
-                                     OutputIteratorType first_dest,
-                                     const ValueType& old_value,
-                                     const ValueType& new_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using func_t =
-      StdReplaceCopyFunctor<InputIteratorType, OutputIteratorType, ValueType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first_from, first_dest, old_value, new_value));
-  ex.fence("Kokkos::replace_copy: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-// ------------------------------------------
-// replace_copy_if_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class PredicateType, class ValueType>
-OutputIteratorType replace_copy_if_impl(const std::string& label,
-                                        const ExecutionSpace& ex,
-                                        InputIteratorType first_from,
-                                        InputIteratorType last_from,
-                                        OutputIteratorType first_dest,
-                                        PredicateType pred,
-                                        const ValueType& new_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using func_t =
-      StdReplaceIfCopyFunctor<index_type, InputIteratorType, OutputIteratorType,
-                              PredicateType, ValueType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_t(first_from, first_dest, std::move(pred), new_value));
-  ex.fence("Kokkos::replace_copy_if: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-}  // namespace Impl
-
-// -------------------
-// replace copy
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
-OutputIterator replace_copy(const ExecutionSpace& ex, InputIterator first_from,
-                            InputIterator last_from, OutputIterator first_dest,
-                            const ValueType& old_value,
-                            const ValueType& new_value) {
-  return Impl::replace_copy_impl("Kokkos::replace_copy_iterator_api", ex,
-                                 first_from, last_from, first_dest, old_value,
-                                 new_value);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
-OutputIterator replace_copy(const std::string& label, const ExecutionSpace& ex,
-                            InputIterator first_from, InputIterator last_from,
-                            OutputIterator first_dest,
-                            const ValueType& old_value,
-                            const ValueType& new_value) {
-  return Impl::replace_copy_impl(label, ex, first_from, last_from, first_dest,
-                                 old_value, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
-auto replace_copy(const ExecutionSpace& ex,
-                  const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                  const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                  const ValueType& old_value, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_impl("Kokkos::replace_copy_view_api", ex,
-                                 KE::cbegin(view_from), KE::cend(view_from),
-                                 KE::begin(view_dest), old_value, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
-auto replace_copy(const std::string& label, const ExecutionSpace& ex,
-                  const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                  const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                  const ValueType& old_value, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_impl(label, ex, KE::cbegin(view_from),
-                                 KE::cend(view_from), KE::begin(view_dest),
-                                 old_value, new_value);
-}
-
-// -------------------
-// replace_copy_if
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class PredicateType, class ValueType>
-OutputIterator replace_copy_if(const ExecutionSpace& ex,
-                               InputIterator first_from,
-                               InputIterator last_from,
-                               OutputIterator first_dest, PredicateType pred,
-                               const ValueType& new_value) {
-  return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_iterator_api", ex,
-                                    first_from, last_from, first_dest, pred,
-                                    new_value);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class PredicateType, class ValueType>
-OutputIterator replace_copy_if(const std::string& label,
-                               const ExecutionSpace& ex,
-                               InputIterator first_from,
-                               InputIterator last_from,
-                               OutputIterator first_dest, PredicateType pred,
-                               const ValueType& new_value) {
-  return Impl::replace_copy_if_impl(label, ex, first_from, last_from,
-                                    first_dest, pred, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class PredicateType,
-          class ValueType>
-auto replace_copy_if(const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                     PredicateType pred, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_view_api", ex,
-                                    KE::cbegin(view_from), KE::cend(view_from),
-                                    KE::begin(view_dest), pred, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class PredicateType,
-          class ValueType>
-auto replace_copy_if(const std::string& label, const ExecutionSpace& ex,
-                     const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                     const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                     PredicateType pred, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_copy_if_impl(label, ex, KE::cbegin(view_from),
-                                    KE::cend(view_from), KE::begin(view_dest),
-                                    pred, new_value);
-}
-
-// -------------------
-// replace
-// -------------------
-template <class ExecutionSpace, class Iterator, class ValueType>
-void replace(const ExecutionSpace& ex, Iterator first, Iterator last,
-             const ValueType& old_value, const ValueType& new_value) {
-  return Impl::replace_impl("Kokkos::replace_iterator_api", ex, first, last,
-                            old_value, new_value);
-}
-
-template <class ExecutionSpace, class Iterator, class ValueType>
-void replace(const std::string& label, const ExecutionSpace& ex, Iterator first,
-             Iterator last, const ValueType& old_value,
-             const ValueType& new_value) {
-  return Impl::replace_impl(label, ex, first, last, old_value, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class ValueType>
-void replace(const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType1, Properties1...>& view,
-             const ValueType& old_value, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_impl("Kokkos::replace_view_api", ex, KE::begin(view),
-                            KE::end(view), old_value, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class ValueType>
-void replace(const std::string& label, const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType1, Properties1...>& view,
-             const ValueType& old_value, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_impl(label, ex, KE::begin(view), KE::end(view),
-                            old_value, new_value);
-}
-
-// -------------------
-// replace_if
-// -------------------
-template <class ExecutionSpace, class InputIterator, class Predicate,
-          class ValueType>
-void replace_if(const ExecutionSpace& ex, InputIterator first,
-                InputIterator last, Predicate pred,
-                const ValueType& new_value) {
-  return Impl::replace_if_impl("Kokkos::replace_if_iterator_api", ex, first,
-                               last, pred, new_value);
-}
-
-template <class ExecutionSpace, class InputIterator, class Predicate,
-          class ValueType>
-void replace_if(const std::string& label, const ExecutionSpace& ex,
-                InputIterator first, InputIterator last, Predicate pred,
-                const ValueType& new_value) {
-  return Impl::replace_if_impl(label, ex, first, last, pred, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Predicate, class ValueType>
-void replace_if(const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType1, Properties1...>& view,
-                Predicate pred, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_if_impl("Kokkos::replace_if_view_api", ex,
-                               KE::begin(view), KE::end(view), pred, new_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Predicate, class ValueType>
-void replace_if(const std::string& label, const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType1, Properties1...>& view,
-                Predicate pred, const ValueType& new_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::replace_if_impl(label, ex, KE::begin(view), KE::end(view), pred,
-                               new_value);
-}
-
-// -------------------
-// copy
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator copy(const ExecutionSpace& ex, InputIterator first,
-                    InputIterator last, OutputIterator d_first) {
-  return Impl::copy_impl("Kokkos::copy_iterator_api_default", ex, first, last,
-                         d_first);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator copy(const std::string& label, const ExecutionSpace& ex,
-                    InputIterator first, InputIterator last,
-                    OutputIterator d_first) {
-  return Impl::copy_impl(label, ex, first, last, d_first);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto copy(const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_impl("Kokkos::copy_view_api_default", ex,
-                         KE::cbegin(source), KE::cend(source), KE::begin(dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto copy(const std::string& label, const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_impl(label, ex, KE::cbegin(source), KE::cend(source),
-                         KE::begin(dest));
-}
-
-// -------------------
-// copy_n
-// -------------------
-template <class ExecutionSpace, class InputIterator, class Size,
-          class OutputIterator>
-OutputIterator copy_n(const ExecutionSpace& ex, InputIterator first, Size count,
-                      OutputIterator result) {
-  return Impl::copy_n_impl("Kokkos::copy_n_iterator_api_default", ex, first,
-                           count, result);
-}
-
-template <class ExecutionSpace, class InputIterator, class Size,
-          class OutputIterator>
-OutputIterator copy_n(const std::string& label, const ExecutionSpace& ex,
-                      InputIterator first, Size count, OutputIterator result) {
-  return Impl::copy_n_impl(label, ex, first, count, result);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Size, class DataType2, class... Properties2>
-auto copy_n(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
-            ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_n_impl("Kokkos::copy_n_view_api_default", ex,
-                           KE::cbegin(source), count, KE::begin(dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class Size, class DataType2, class... Properties2>
-auto copy_n(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType1, Properties1...>& source, Size count,
-            ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::copy_n_impl(label, ex, KE::cbegin(source), count,
-                           KE::begin(dest));
-}
-
-// -------------------
-// copy_backward
-// -------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 copy_backward(const ExecutionSpace& ex, IteratorType1 first,
-                            IteratorType1 last, IteratorType2 d_last) {
-  return Impl::copy_backward_impl("Kokkos::copy_backward_iterator_api_default",
-                                  ex, first, last, d_last);
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 copy_backward(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 d_last) {
-  return Impl::copy_backward_impl(label, ex, first, last, d_last);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto copy_backward(const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::copy_backward_impl("Kokkos::copy_backward_view_api_default", ex,
-                                  cbegin(source), cend(source), end(dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto copy_backward(const std::string& label, const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::copy_backward_impl(label, ex, cbegin(source), cend(source),
-                                  end(dest));
-}
-
-// -------------------
-// copy_if
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class Predicate>
-OutputIterator copy_if(const ExecutionSpace& ex, InputIterator first,
-                       InputIterator last, OutputIterator d_first,
-                       Predicate pred) {
-  return Impl::copy_if_impl("Kokkos::copy_if_iterator_api_default", ex, first,
-                            last, d_first, std::move(pred));
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class Predicate>
-OutputIterator copy_if(const std::string& label, const ExecutionSpace& ex,
-                       InputIterator first, InputIterator last,
-                       OutputIterator d_first, Predicate pred) {
-  return Impl::copy_if_impl(label, ex, first, last, d_first, std::move(pred));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class Predicate>
-auto copy_if(const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType1, Properties1...>& source,
-             ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::copy_if_impl("Kokkos::copy_if_view_api_default", ex,
-                            cbegin(source), cend(source), begin(dest),
-                            std::move(pred));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class Predicate>
-auto copy_if(const std::string& label, const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType1, Properties1...>& source,
-             ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::copy_if_impl(label, ex, cbegin(source), cend(source),
-                            begin(dest), std::move(pred));
-}
-
-// -------------------
-// fill
-// -------------------
-template <class ExecutionSpace, class IteratorType, class T>
-void fill(const ExecutionSpace& ex, IteratorType first, IteratorType last,
-          const T& value) {
-  Impl::fill_impl("Kokkos::fill_iterator_api_default", ex, first, last, value);
-}
-
-template <class ExecutionSpace, class IteratorType, class T>
-void fill(const std::string& label, const ExecutionSpace& ex,
-          IteratorType first, IteratorType last, const T& value) {
-  Impl::fill_impl(label, ex, first, last, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class T>
-void fill(const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  Impl::fill_impl("Kokkos::fill_view_api_default", ex, begin(view), end(view),
-                  value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class T>
-void fill(const std::string& label, const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType, Properties...>& view, const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  Impl::fill_impl(label, ex, begin(view), end(view), value);
-}
-
-// -------------------
-// fill_n
-// -------------------
-template <class ExecutionSpace, class IteratorType, class SizeType, class T>
-IteratorType fill_n(const ExecutionSpace& ex, IteratorType first, SizeType n,
-                    const T& value) {
-  return Impl::fill_n_impl("Kokkos::fill_n_iterator_api_default", ex, first, n,
-                           value);
-}
-
-template <class ExecutionSpace, class IteratorType, class SizeType, class T>
-IteratorType fill_n(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, SizeType n, const T& value) {
-  return Impl::fill_n_impl(label, ex, first, n, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class T>
-auto fill_n(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view, SizeType n,
-            const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  return Impl::fill_n_impl("Kokkos::fill_n_view_api_default", ex, begin(view),
-                           n, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class SizeType, class T>
-auto fill_n(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view, SizeType n,
-            const T& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  return Impl::fill_n_impl(label, ex, begin(view), n, value);
-}
-
-// -------------------
-// transform
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator, OutputIterator>::value,
-                  OutputIterator>
-transform(const ExecutionSpace& ex, InputIterator first1, InputIterator last1,
-          OutputIterator d_first, UnaryOperation unary_op) {
-  return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex,
-                              first1, last1, d_first, std::move(unary_op));
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator, OutputIterator>::value,
-                  OutputIterator>
-transform(const std::string& label, const ExecutionSpace& ex,
-          InputIterator first1, InputIterator last1, OutputIterator d_first,
-          UnaryOperation unary_op) {
-  return Impl::transform_impl(label, ex, first1, last1, d_first,
-                              std::move(unary_op));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryOperation>
-auto transform(const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType1, Properties1...>& source,
-               ::Kokkos::View<DataType2, Properties2...>& dest,
-               UnaryOperation unary_op) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::transform_impl("Kokkos::transform_view_api_default", ex,
-                              begin(source), end(source), begin(dest),
-                              std::move(unary_op));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryOperation>
-auto transform(const std::string& label, const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType1, Properties1...>& source,
-               ::Kokkos::View<DataType2, Properties2...>& dest,
-               UnaryOperation unary_op) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::transform_impl(label, ex, begin(source), end(source),
-                              begin(dest), std::move(unary_op));
-}
-
-template <class ExecutionSpace, class InputIterator1, class InputIterator2,
-          class OutputIterator, class BinaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator1, InputIterator2, OutputIterator>::value,
-                  OutputIterator>
-transform(const ExecutionSpace& ex, InputIterator1 first1, InputIterator1 last1,
-          InputIterator2 first2, OutputIterator d_first,
-          BinaryOperation binary_op) {
-  return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex,
-                              first1, last1, first2, d_first,
-                              std::move(binary_op));
-}
-
-template <class ExecutionSpace, class InputIterator1, class InputIterator2,
-          class OutputIterator, class BinaryOperation>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIterator1, InputIterator2, OutputIterator>::value,
-                  OutputIterator>
-transform(const std::string& label, const ExecutionSpace& ex,
-          InputIterator1 first1, InputIterator1 last1, InputIterator2 first2,
-          OutputIterator d_first, BinaryOperation binary_op) {
-  return Impl::transform_impl(label, ex, first1, last1, first2, d_first,
-                              std::move(binary_op));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class BinaryOperation>
-auto transform(const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType1, Properties1...>& source1,
-               const ::Kokkos::View<DataType2, Properties2...>& source2,
-               ::Kokkos::View<DataType3, Properties3...>& dest,
-               BinaryOperation binary_op) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::transform_impl("Kokkos::transform_view_api_default", ex,
-                              begin(source1), end(source1), begin(source2),
-                              begin(dest), std::move(binary_op));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class DataType3,
-          class... Properties3, class BinaryOperation>
-auto transform(const std::string& label, const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType1, Properties1...>& source1,
-               const ::Kokkos::View<DataType2, Properties2...>& source2,
-               ::Kokkos::View<DataType3, Properties3...>& dest,
-               BinaryOperation binary_op) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::transform_impl(label, ex, begin(source1), end(source1),
-                              begin(source2), begin(dest),
-                              std::move(binary_op));
-}
-
-// -------------------
-// generate
-// -------------------
-template <class ExecutionSpace, class IteratorType, class Generator>
-void generate(const ExecutionSpace& ex, IteratorType first, IteratorType last,
-              Generator g) {
-  Impl::generate_impl("Kokkos::generate_iterator_api_default", ex, first, last,
-                      std::move(g));
-}
-
-template <class ExecutionSpace, class IteratorType, class Generator>
-void generate(const std::string& label, const ExecutionSpace& ex,
-              IteratorType first, IteratorType last, Generator g) {
-  Impl::generate_impl(label, ex, first, last, std::move(g));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Generator>
-void generate(const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& view,
-              Generator g) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  Impl::generate_impl("Kokkos::generate_view_api_default", ex, begin(view),
-                      end(view), std::move(g));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class Generator>
-void generate(const std::string& label, const ExecutionSpace& ex,
-              const ::Kokkos::View<DataType, Properties...>& view,
-              Generator g) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  Impl::generate_impl(label, ex, begin(view), end(view), std::move(g));
-}
-
-// -------------------
-// generate_n
-// -------------------
-template <class ExecutionSpace, class IteratorType, class Size, class Generator>
-IteratorType generate_n(const ExecutionSpace& ex, IteratorType first,
-                        Size count, Generator g) {
-  Impl::generate_n_impl("Kokkos::generate_n_iterator_api_default", ex, first,
-                        count, std::move(g));
-  return first + count;
-}
-
-template <class ExecutionSpace, class IteratorType, class Size, class Generator>
-IteratorType generate_n(const std::string& label, const ExecutionSpace& ex,
-                        IteratorType first, Size count, Generator g) {
-  Impl::generate_n_impl(label, ex, first, count, std::move(g));
-  return first + count;
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class Size,
-          class Generator>
-auto generate_n(const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType, Properties...>& view, Size count,
-                Generator g) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  return Impl::generate_n_impl("Kokkos::generate_n_view_api_default", ex,
-                               begin(view), count, std::move(g));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties, class Size,
-          class Generator>
-auto generate_n(const std::string& label, const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType, Properties...>& view, Size count,
-                Generator g) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  return Impl::generate_n_impl(label, ex, begin(view), count, std::move(g));
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet2.hpp b/packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet2.hpp
deleted file mode 100644
index 9d2c85f00..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet2.hpp
+++ /dev/null
@@ -1,1783 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_MODIFYING_SEQUENCE_OPERATIONS_SET2_HPP
-#define KOKKOS_MODIFYING_SEQUENCE_OPERATIONS_SET2_HPP
-
-#include <Kokkos_Core.hpp>
-#include "../Kokkos_BeginEnd.hpp"
-#include "../Kokkos_Constraints.hpp"
-#include "../Kokkos_ModifyingOperations.hpp"
-#include "../Kokkos_NonModifyingSequenceOperations.hpp"
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-//-------------------------
-//
-// functors
-//
-//-------------------------
-
-template <class IndexType, class InputIt, class OutputIt,
-          class BinaryPredicateType>
-struct StdUniqueCopyFunctor {
-  InputIt m_first_from;
-  InputIt m_last_from;
-  OutputIt m_first_dest;
-  BinaryPredicateType m_pred;
-
-  KOKKOS_FUNCTION
-  StdUniqueCopyFunctor(InputIt first_from, InputIt last_from,
-                       OutputIt first_dest, BinaryPredicateType pred)
-      : m_first_from(std::move(first_from)),
-        m_last_from(std::move(last_from)),
-        m_first_dest(std::move(first_dest)),
-        m_pred(std::move(pred)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, IndexType& update,
-                  const bool final_pass) const {
-    const auto& val_i   = m_first_from[i];
-    const auto& val_ip1 = m_first_from[i + 1];
-
-    if (final_pass) {
-      if (!m_pred(val_i, val_ip1)) {
-        m_first_dest[update] = val_i;
-      }
-    }
-
-    if (!m_pred(val_i, val_ip1)) {
-      update += 1;
-    }
-  }
-};
-
-template <class InputIterator>
-struct StdReverseFunctor {
-  using index_type = typename InputIterator::difference_type;
-  static_assert(std::is_signed<index_type>::value,
-                "Kokkos: StdReverseFunctor requires signed index type");
-
-  InputIterator m_first;
-  InputIterator m_last;
-
-  KOKKOS_FUNCTION
-  void operator()(index_type i) const {
-    // the swap below is doing the same thing, but
-    // for Intel 18.0.5 does not work.
-    // But putting the impl directly here, it works.
-#ifdef KOKKOS_COMPILER_INTEL
-    typename InputIterator::value_type tmp = std::move(m_first[i]);
-    m_first[i]                             = std::move(m_last[-i - 1]);
-    m_last[-i - 1]                         = std::move(tmp);
-#else
-    ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]);
-#endif
-  }
-
-  StdReverseFunctor(InputIterator first, InputIterator last)
-      : m_first(std::move(first)), m_last(std::move(last)) {}
-};
-
-template <class IndexType, class InputIterator, class OutputIterator>
-struct StdReverseCopyFunctor {
-  static_assert(std::is_signed<IndexType>::value,
-                "Kokkos: StdReverseCopyFunctor requires signed index type");
-
-  InputIterator m_last;
-  OutputIterator m_dest_first;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const { m_dest_first[i] = m_last[-1 - i]; }
-
-  StdReverseCopyFunctor(InputIterator _last, OutputIterator _dest_first)
-      : m_last(std::move(_last)), m_dest_first(std::move(_dest_first)) {}
-};
-
-template <class IndexType, class InputIterator, class OutputIterator>
-struct StdMoveFunctor {
-  InputIterator m_first;
-  OutputIterator m_dest_first;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    m_dest_first[i] = std::move(m_first[i]);
-  }
-
-  StdMoveFunctor(InputIterator _first, OutputIterator _dest_first)
-      : m_first(std::move(_first)), m_dest_first(std::move(_dest_first)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2>
-struct StdMoveBackwardFunctor {
-  static_assert(std::is_signed<IndexType>::value,
-                "Kokkos: StdMoveBackwardFunctor requires signed index type");
-
-  IteratorType1 m_last;
-  IteratorType2 m_dest_last;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    m_dest_last[-i] = std::move(m_last[-i]);
-  }
-
-  StdMoveBackwardFunctor(IteratorType1 _last, IteratorType2 _dest_last)
-      : m_last(std::move(_last)), m_dest_last(std::move(_dest_last)) {}
-};
-
-template <class IndexType, class IteratorType1, class IteratorType2>
-struct StdSwapRangesFunctor {
-  IteratorType1 m_first1;
-  IteratorType2 m_first2;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    // the swap below is doing the same thing, but
-    // for Intel 18.0.5 does not work.
-    // But putting the impl directly here, it works.
-#ifdef KOKKOS_COMPILER_INTEL
-    typename IteratorType1::value_type tmp = std::move(m_first1[i]);
-    m_first1[i]                            = std::move(m_first2[i]);
-    m_first2[i]                            = std::move(tmp);
-#else
-    ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]);
-#endif
-  }
-
-  KOKKOS_FUNCTION
-  StdSwapRangesFunctor(IteratorType1 _first1, IteratorType2 _first2)
-      : m_first1(std::move(_first1)), m_first2(std::move(_first2)) {}
-};
-
-template <class IndexType, class InputIt, class OutputIt,
-          class BinaryPredicateType>
-struct StdUniqueFunctor {
-  InputIt m_first_from;
-  InputIt m_last_from;
-  OutputIt m_first_dest;
-  BinaryPredicateType m_pred;
-
-  KOKKOS_FUNCTION
-  StdUniqueFunctor(InputIt first_from, InputIt last_from, OutputIt first_dest,
-                   BinaryPredicateType pred)
-      : m_first_from(std::move(first_from)),
-        m_last_from(std::move(last_from)),
-        m_first_dest(std::move(first_dest)),
-        m_pred(std::move(pred)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, IndexType& update,
-                  const bool final_pass) const {
-    auto& val_i         = m_first_from[i];
-    const auto& val_ip1 = m_first_from[i + 1];
-
-    if (final_pass) {
-      if (!m_pred(val_i, val_ip1)) {
-        m_first_dest[update] = std::move(val_i);
-      }
-    }
-
-    if (!m_pred(val_i, val_ip1)) {
-      update += 1;
-    }
-  }
-};
-
-template <class IndexType, class InputIterator, class OutputIterator>
-struct StdRotateCopyFunctor {
-  InputIterator m_first;
-  InputIterator m_last;
-  InputIterator m_first_n;
-  OutputIterator m_dest_first;
-
-  KOKKOS_FUNCTION
-  void operator()(IndexType i) const {
-    const IndexType shift = m_last - m_first_n;
-
-    if (i < shift) {
-      m_dest_first[i] = m_first_n[i];
-    } else {
-      m_dest_first[i] = m_first[i - shift];
-    }
-  }
-
-  StdRotateCopyFunctor(InputIterator first, InputIterator last,
-                       InputIterator first_n, OutputIterator dest_first)
-      : m_first(std::move(first)),
-        m_last(std::move(last)),
-        m_first_n(std::move(first_n)),
-        m_dest_first(std::move(dest_first)) {}
-};
-
-template <class IndexType, class FirstFrom, class FirstDest, class PredType>
-struct StdRemoveIfStage1Functor {
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-  PredType m_must_remove;
-
-  KOKKOS_FUNCTION
-  StdRemoveIfStage1Functor(FirstFrom first_from, FirstDest first_dest,
-                           PredType pred)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_must_remove(std::move(pred)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, IndexType& update,
-                  const bool final_pass) const {
-    auto& myval = m_first_from[i];
-    if (final_pass) {
-      if (!m_must_remove(myval)) {
-        // calling move here is ok because we are inside final pass
-        // we are calling move assign as specified by the std
-        m_first_dest[update] = std::move(myval);
-      }
-    }
-
-    if (!m_must_remove(myval)) {
-      update += 1;
-    }
-  }
-};
-
-template <class IndexType, class InputIteratorType, class OutputIteratorType>
-struct StdRemoveIfStage2Functor {
-  InputIteratorType m_first_from;
-  OutputIteratorType m_first_to;
-
-  KOKKOS_FUNCTION
-  StdRemoveIfStage2Functor(InputIteratorType first_from,
-                           OutputIteratorType first_to)
-      : m_first_from(std::move(first_from)), m_first_to(std::move(first_to)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i) const {
-    m_first_to[i] = std::move(m_first_from[i]);
-  }
-};
-
-// ------------------------------------------
-// unique_copy_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class PredicateType>
-OutputIterator unique_copy_impl(const std::string& label,
-                                const ExecutionSpace& ex, InputIterator first,
-                                InputIterator last, OutputIterator d_first,
-                                PredicateType pred) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
-  Impl::expect_valid_range(first, last);
-
-  // branch for trivial vs non trivial case
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  if (num_elements == 0) {
-    return d_first;
-  } else if (num_elements == 1) {
-    return Impl::copy_impl("Kokkos::copy_from_unique_copy", ex, first, last,
-                           d_first);
-  } else {
-    // aliases
-    using index_type = typename InputIterator::difference_type;
-    using func_type  = StdUniqueCopyFunctor<index_type, InputIterator,
-                                           OutputIterator, PredicateType>;
-
-    // note here that we run scan for num_elements - 1
-    // because of the way we implement this, the last element is always needed.
-    // We avoid performing checks inside functor that we are within limits
-    // and run a "safe" scan and then copy the last element.
-    const auto scan_size = num_elements - 1;
-    index_type count     = 0;
-    ::Kokkos::parallel_scan(label,
-                            RangePolicy<ExecutionSpace>(ex, 0, scan_size),
-                            func_type(first, last, d_first, pred), count);
-
-    return Impl::copy_impl("Kokkos::copy_from_unique_copy", ex,
-                           first + scan_size, last, d_first + count);
-  }
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator unique_copy_impl(const std::string& label,
-                                const ExecutionSpace& ex, InputIterator first,
-                                InputIterator last, OutputIterator d_first) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using value_type1 = typename InputIterator::value_type;
-  using value_type2 = typename OutputIterator::value_type;
-
-  // default binary predicate uses ==
-  using binary_pred_t = StdAlgoEqualBinaryPredicate<value_type1, value_type2>;
-
-  // run
-  return unique_copy_impl(label, ex, first, last, d_first, binary_pred_t());
-}
-
-// ------------------------------------------
-// reverse_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator>
-void reverse_impl(const std::string& label, const ExecutionSpace& ex,
-                  InputIterator first, InputIterator last) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using func_t = StdReverseFunctor<InputIterator>;
-
-  // run
-  if (last >= first + 2) {
-    // only need half
-    const auto num_elements = Kokkos::Experimental::distance(first, last) / 2;
-    ::Kokkos::parallel_for(label,
-                           RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                           func_t(first, last));
-    ex.fence("Kokkos::reverse: fence after operation");
-  }
-}
-
-// ------------------------------------------
-// reverse_copy_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator reverse_copy_impl(const std::string& label,
-                                 const ExecutionSpace& ex, InputIterator first,
-                                 InputIterator last, OutputIterator d_first) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_t =
-      StdReverseCopyFunctor<index_type, InputIterator, OutputIterator>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(last, d_first));
-  ex.fence("Kokkos::reverse_copy: fence after operation");
-
-  // return
-  return d_first + num_elements;
-}
-
-// ------------------------------------------
-// move_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator move_impl(const std::string& label, const ExecutionSpace& ex,
-                         InputIterator first, InputIterator last,
-                         OutputIterator d_first) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_t     = StdMoveFunctor<index_type, InputIterator, OutputIterator>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(first, d_first));
-  ex.fence("Kokkos::move: fence after operation");
-
-  // return
-  return d_first + num_elements;
-}
-
-// ------------------------------------------
-// move_backward_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 move_backward_impl(const std::string& label,
-                                 const ExecutionSpace& ex, IteratorType1 first,
-                                 IteratorType1 last, IteratorType2 d_last) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_last);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_last);
-  Impl::expect_valid_range(first, last);
-
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t =
-      StdMoveBackwardFunctor<index_type, IteratorType1, IteratorType2>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_t(last, d_last));
-  ex.fence("Kokkos::move_backward: fence after operation");
-
-  // return
-  return d_last - num_elements;
-}
-
-// ------------------------------------------
-// swap_ranges_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 swap_ranges_impl(const std::string& label,
-                               const ExecutionSpace& ex, IteratorType1 first1,
-                               IteratorType1 last1, IteratorType2 first2) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first1, first2);
-  Impl::static_assert_iterators_have_matching_difference_type(first1, first2);
-  Impl::expect_valid_range(first1, last1);
-
-  // aliases
-  using index_type = typename IteratorType1::difference_type;
-  using func_t = StdSwapRangesFunctor<index_type, IteratorType1, IteratorType2>;
-
-  // run
-  const auto num_elements_to_swap =
-      Kokkos::Experimental::distance(first1, last1);
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_to_swap),
-      func_t(first1, first2));
-  ex.fence("Kokkos::swap_ranges: fence after operation");
-
-  // return
-  return first2 + num_elements_to_swap;
-}
-
-// ------------------------------------------
-// unique_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class PredicateType>
-IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last,
-                         PredicateType pred) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  if (num_elements == 0) {
-    return first;
-  } else if (num_elements == 1) {
-    return last;
-  } else {
-    // ----------
-    // step 1:
-    // find first location of adjacent equal elements
-    // ----------
-    auto it_found =
-        ::Kokkos::Experimental::adjacent_find(ex, first, last, pred);
-
-    // if none, all elements are unique, so nothing to do
-    if (it_found == last) {
-      return last;
-    } else {
-      // if here, we found some equal adjacent elements,
-      // so count all preceeding unique elements
-      const auto num_unique_found_in_step_one = it_found - first;
-
-      // ----------
-      // step 2:
-      // ----------
-      // since we found some unique elements, we don't need to explore
-      // the full range [first, last), but only need to focus on the
-      // remaining range [it_found, last)
-      const auto num_elements_to_explore = last - it_found;
-
-      // create a tmp view to use to *move* all unique elements
-      // using the same algorithm used for unique_copy but we now move things
-      using value_type    = typename IteratorType::value_type;
-      using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-      tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore);
-
-      // scan extent is: num_elements_to_explore - 1
-      // for same reason as the one explained in unique_copy
-      const auto scan_size = num_elements_to_explore - 1;
-      auto tmp_first       = ::Kokkos::Experimental::begin(tmp_view);
-      using output_it      = decltype(tmp_first);
-
-      using index_type = typename IteratorType::difference_type;
-      using func_type =
-          StdUniqueFunctor<index_type, IteratorType, output_it, PredicateType>;
-      index_type count = 0;
-      ::Kokkos::parallel_scan(
-          label, RangePolicy<ExecutionSpace>(ex, 0, scan_size),
-          func_type(it_found, last, tmp_first, pred), count);
-
-      // move last element too, for the same reason as the unique_copy
-      auto unused_r =
-          Impl::move_impl("Kokkos::move_from_unique", ex, it_found + scan_size,
-                          last, tmp_first + count);
-      (void)unused_r;  // r1 not used
-
-      // ----------
-      // step 3
-      // ----------
-      // move back from tmp to original range,
-      // ensuring we start overwriting after the original unique found
-      using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
-      using step3_func_t =
-          StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
-
-      ::Kokkos::parallel_for(
-          "unique_step3_parfor",
-          RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
-          step3_func_t(begin(tmp_view),
-                       (first + num_unique_found_in_step_one)));
-
-      ex.fence("Kokkos::unique: fence after operation");
-
-      // return iterator to one passed the last written
-      // (the +1 is needed to account for the last element, see above)
-      return (first + num_unique_found_in_step_one + count + 1);
-    }
-  }
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType unique_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last) {
-  using value_type    = typename IteratorType::value_type;
-  using binary_pred_t = StdAlgoEqualBinaryPredicate<value_type>;
-  return unique_impl(label, ex, first, last, binary_pred_t());
-}
-
-// ------------------------------------------
-// rotate_copy_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator rotate_copy_impl(const std::string& label,
-                                const ExecutionSpace& ex, InputIterator first,
-                                InputIterator n_first, InputIterator last,
-                                OutputIterator d_first) {
-  /*
-    algorithm is implemented as follows:
-
-    first 	   n_first		last
-    |		      |                  |
-    o  o  o  o  o  o  o  o  o  o  o  o
-
-    dest+0 -> first_n
-    dest+1 -> first_n+1
-    dest+2 -> first_n+2
-    dest+3 -> first
-    dest+4 -> first+1
-    dest+5 -> first+2
-    dest+6 -> first+3
-    dest+7 -> first+4
-    dest+8 -> first+5
-    ...
-    let shift = last - first_n;
-
-    then we have:
-    if (i < shift){
-      *(dest_first + i) = *(first_n + i);
-    }
-    else{
-      *(dest_first + i) = *(from + i - shift);
-    }
-  */
-
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first, d_first);
-  Impl::static_assert_iterators_have_matching_difference_type(first, d_first);
-  Impl::expect_valid_range(first, last);
-  Impl::expect_valid_range(first, n_first);
-  Impl::expect_valid_range(n_first, last);
-
-  if (first == last) {
-    return d_first;
-  }
-
-  // aliases
-  using index_type = typename InputIterator::difference_type;
-  using func_type =
-      StdRotateCopyFunctor<index_type, InputIterator, OutputIterator>;
-
-  // run
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                         func_type(first, last, n_first, d_first));
-
-  ex.fence("Kokkos::rotate_copy: fence after operation");
-
-  // return
-  return d_first + num_elements;
-}
-
-// ------------------------------------------
-// rotate_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType>
-IteratorType rotate_with_pivot_in_left_half(const std::string& label,
-                                            const ExecutionSpace& ex,
-                                            IteratorType first,
-                                            IteratorType n_first,
-                                            IteratorType last) {
-  /*
-    This impl is specific for when the n_first iterator points to
-    an element that is before or equal to the middle of the range.
-
-    If we have:
-
-    | 0 | 1 | 2 | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 | -6 | -5 | 8 | 9 | 11 | *
-      ^           ^              mid					   ^
-    first       n_first							  last
-
-    In step 1, we create a temporary view with extent = distance(n_first, last)
-    and *move* the elements from [n_first, last) to tmp view, such that
-    tmp view becomes:
-
-    | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 | -6 | -5 | 8 | 9 | 11 |
-
-    In step 2, we move the elements in [first, n_first)
-    to the new position where they are supposed to end up.
-
-    In step 3, we move the elements from the tmp view to
-    the range starting at first.
-   */
-
-  namespace KE                     = ::Kokkos::Experimental;
-  const auto num_elements_on_left  = KE::distance(first, n_first);
-  const auto num_elements_on_right = KE::distance(n_first, last);
-
-  // create helper tmp view
-  using value_type    = typename IteratorType::value_type;
-  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-  tmp_view_type tmp_view("rotate_impl_for_pivot_in_left_half_impl",
-                         num_elements_on_right);
-  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
-
-  // index_type is the same and needed in all steps
-  using index_type = typename IteratorType::difference_type;
-
-  // stage 1
-  using step1_func_type =
-      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_right),
-      step1_func_type(n_first, begin(tmp_view)));
-
-  // stage 2
-  using step2_func_type =
-      StdMoveFunctor<index_type, IteratorType, IteratorType>;
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_left),
-      step2_func_type(first, first + num_elements_on_right));
-
-  // step 3
-  using step3_func_type =
-      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
-                         step3_func_type(begin(tmp_view), first));
-
-  ex.fence("Kokkos::rotate: fence after operation");
-  return first + (last - n_first);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType rotate_with_pivot_in_right_half(const std::string& label,
-                                             const ExecutionSpace& ex,
-                                             IteratorType first,
-                                             IteratorType n_first,
-                                             IteratorType last) {
-  /*
-    This impl is specific for when the n_first iterator points to
-    an element that is after the middle of the range.
-
-    If we have:
-
-    | 0 | 1 | 2 | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 | -6 | -5 | 8 | 9 | 11 | *
-      ^                          mid            ^                          ^
-    first                                    n_first			  last
-
-    In step 1, we create a temporary view with extent = distance(first, n_first)
-    and *move* the elements from [first, n_first) to tmp view,
-    such that tmp view becomes:
-
-    | 0 | 1 | 2 | 1 | 4 | 5 | 2 | 2 | 10 | -3 | 1 |
-
-    In step 2, we move the elements in [n_first, last)
-    to the beginning where they are supposed to end up.
-
-    In step 3, we move the elements from the tmp view to
-    the range starting at first.
-   */
-
-  namespace KE                     = ::Kokkos::Experimental;
-  const auto num_elements_on_left  = KE::distance(first, n_first);
-  const auto num_elements_on_right = KE::distance(n_first, last);
-
-  // create helper tmp view
-  using value_type    = typename IteratorType::value_type;
-  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-  tmp_view_type tmp_view("rotate_impl_for_pivot_in_left_half_impl",
-                         num_elements_on_left);
-  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
-
-  // index_type is the same and needed in all steps
-  using index_type = typename IteratorType::difference_type;
-
-  // stage 1
-  using step1_func_type =
-      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_left),
-      step1_func_type(first, begin(tmp_view)));
-
-  // stage 2
-  using step2_func_type =
-      StdMoveFunctor<index_type, IteratorType, IteratorType>;
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_on_right),
-      step2_func_type(n_first, first));
-
-  // step 3:
-  using step3_func_type =
-      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
-      step3_func_type(begin(tmp_view), first + num_elements_on_right));
-
-  ex.fence("Kokkos::rotate: fence after operation");
-  return first + (last - n_first);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType rotate_impl(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType n_first,
-                         IteratorType last) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-  Impl::expect_valid_range(first, n_first);
-  Impl::expect_valid_range(n_first, last);
-
-  namespace KE                     = ::Kokkos::Experimental;
-  const auto num_elements          = KE::distance(first, last);
-  const auto n_distance_from_first = KE::distance(first, n_first);
-  if (n_distance_from_first <= num_elements / 2) {
-    return rotate_with_pivot_in_left_half(label, ex, first, n_first, last);
-  } else {
-    return rotate_with_pivot_in_right_half(label, ex, first, n_first, last);
-  }
-}
-
-// ------------------------------------------
-// remove_if_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class UnaryPredicateType>
-IteratorType remove_if_impl(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType first, IteratorType last,
-                            UnaryPredicateType pred) {
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-
-  if (first == last) {
-    return last;
-  } else {
-    // create tmp buffer to use to *move* all elements that we need to keep.
-    // note that the tmp buffer is just large enought to store
-    // all elements to keep, because ideally we do not need/want one
-    // as large as the original range.
-    // To allocate the right tmp view, we need a call to count_if.
-    // We could just do a "safe" allocation of a buffer as
-    // large as (last-first), but I think a call to count_if is more afforable.
-
-    // count how many elements we need to keep
-    // note that the elements to remove are those that meet the predicate
-    const auto remove_count =
-        ::Kokkos::Experimental::count_if(ex, first, last, pred);
-    const auto keep_count =
-        Kokkos::Experimental::distance(first, last) - remove_count;
-
-    // create helper tmp view
-    using value_type    = typename IteratorType::value_type;
-    using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-    tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count);
-    using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
-
-    // in stage 1, *move* all elements to keep from original range to tmp
-    // we use similar impl as copy_if except that we *move* rather than copy
-    using index_type = typename IteratorType::difference_type;
-    using func1_type = StdRemoveIfStage1Functor<index_type, IteratorType,
-                                                tmp_readwrite_iterator_type,
-                                                UnaryPredicateType>;
-
-    const auto scan_num_elements = Kokkos::Experimental::distance(first, last);
-    index_type scan_count        = 0;
-    ::Kokkos::parallel_scan(
-        label, RangePolicy<ExecutionSpace>(ex, 0, scan_num_elements),
-        func1_type(first, begin(tmp_view), pred), scan_count);
-
-    // scan_count should be equal to keep_count
-    assert(scan_count == keep_count);
-    (void)scan_count;  // to avoid unused complaints
-
-    // stage 2, we do parfor to move from tmp to original range
-    using func2_type =
-        StdRemoveIfStage2Functor<index_type, tmp_readwrite_iterator_type,
-                                 IteratorType>;
-    ::Kokkos::parallel_for(
-        "remove_if_stage2_parfor",
-        RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
-        func2_type(begin(tmp_view), first));
-    ex.fence("Kokkos::remove_if: fence after stage2");
-
-    // return
-    return first + keep_count;
-  }
-}
-
-// ------------------------------------------
-// remove_impl
-// ------------------------------------------
-template <class ExecutionSpace, class IteratorType, class ValueType>
-auto remove_impl(const std::string& label, const ExecutionSpace& ex,
-                 IteratorType first, IteratorType last,
-                 const ValueType& value) {
-  using predicate_type = StdAlgoEqualsValUnaryPredicate<ValueType>;
-  return remove_if_impl(label, ex, first, last, predicate_type(value));
-}
-
-// ------------------------------------------
-// remove_copy_impl
-// ------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-auto remove_copy_impl(const std::string& label, const ExecutionSpace& ex,
-                      InputIteratorType first_from, InputIteratorType last_from,
-                      OutputIteratorType first_dest, const ValueType& value) {
-  // this is like copy_if except that we need to *ignore* the elements
-  // that match the value, so we can solve this as follows:
-
-  using predicate_type = StdAlgoNotEqualsValUnaryPredicate<ValueType>;
-  return ::Kokkos::Experimental::copy_if(label, ex, first_from, last_from,
-                                         first_dest, predicate_type(value));
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class UnaryPredicate>
-auto remove_copy_if_impl(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first_from,
-                         InputIteratorType last_from,
-                         OutputIteratorType first_dest,
-                         const UnaryPredicate& pred) {
-  // this is like copy_if except that we need to *ignore* the elements
-  // satisfying the pred, so we can solve this as follows:
-
-  using value_type = typename InputIteratorType::value_type;
-  using pred_wrapper_type =
-      StdAlgoNegateUnaryPredicateWrapper<value_type, UnaryPredicate>;
-  return ::Kokkos::Experimental::copy_if(label, ex, first_from, last_from,
-                                         first_dest, pred_wrapper_type(pred));
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType shift_left_impl(const std::string& label, const ExecutionSpace& ex,
-                             IteratorType first, IteratorType last,
-                             typename IteratorType::difference_type n) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-  KOKKOS_EXPECTS(n >= 0);
-
-  // handle trivial cases
-  if (n == 0) {
-    return last;
-  }
-
-  if (n >= Kokkos::Experimental::distance(first, last)) {
-    return first;
-  }
-
-  /*
-    Suppose that n = 5, and our [first,last) spans:
-
-    | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 | -3 | 1  | -6 | *
-      ^                         				  ^
-    first							 last
-
-    shift_left modifies the range such that we have this data:
-    | 1  | 2  | 2  | 10  | -3 | 1  | -6 | x | x  | x  | x  |  x | *
-                                          ^
-                                   return it pointing here
-
-
-    and returns an iterator pointing to one past the new end.
-    Note: elements marked x are in undefined state because have been moved.
-
-    We implement this in two steps:
-    step 1:
-      we create a temporary view with extent = distance(first+n, last)
-      and *move* assign the elements from [first+n, last) to tmp view, such that
-      tmp view becomes:
-
-      | 1  | 2  | 2  | 10  | -3 | 1  | -6 |
-
-    step 2:
-      move elements of tmp view back to range starting at first.
-   */
-
-  const auto num_elements_to_move =
-      ::Kokkos::Experimental::distance(first + n, last);
-
-  // create tmp view
-  using value_type    = typename IteratorType::value_type;
-  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-  tmp_view_type tmp_view("shift_left_impl", num_elements_to_move);
-  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
-
-  using index_type = typename IteratorType::difference_type;
-
-  // step 1
-  using step1_func_type =
-      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_to_move),
-      step1_func_type(first + n, begin(tmp_view)));
-
-  // step 2
-  using step2_func_type =
-      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
-                         step2_func_type(begin(tmp_view), first));
-
-  ex.fence("Kokkos::shift_left: fence after operation");
-
-  return last - n;
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType shift_right_impl(const std::string& label,
-                              const ExecutionSpace& ex, IteratorType first,
-                              IteratorType last,
-                              typename IteratorType::difference_type n) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first);
-  Impl::expect_valid_range(first, last);
-  KOKKOS_EXPECTS(n >= 0);
-
-  // handle trivial cases
-  if (n == 0) {
-    return first;
-  }
-
-  if (n >= Kokkos::Experimental::distance(first, last)) {
-    return last;
-  }
-
-  /*
-    Suppose that n = 3, and [first,last) spans:
-
-    | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 | -3 | 1  | -6 | *
-      ^                         				  ^
-    first							 last
-
-    shift_right modifies the range such that we have this data:
-    |  x | x  | x  | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 | *
-                     ^
-             return it points here
-
-    and returns an iterator pointing to the new beginning.
-    Note: elements marked x are in undefined state because have been moved.
-
-    We implement this in two steps:
-    step 1:
-      we create a temporary view with extent = distance(first, last-n)
-      and *move* assign the elements from [first, last-n) to tmp view, such that
-      tmp view becomes:
-
-      | 0  | 1  |  2 | 1  | 2  | 1  | 2  | 2  | 10 |
-
-    step 2:
-      move elements of tmp view back to range starting at first+n.
-   */
-
-  const auto num_elements_to_move =
-      ::Kokkos::Experimental::distance(first, last - n);
-
-  // create tmp view
-  using value_type    = typename IteratorType::value_type;
-  using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>;
-  tmp_view_type tmp_view("shift_right_impl", num_elements_to_move);
-  using tmp_readwrite_iterator_type = decltype(begin(tmp_view));
-
-  using index_type = typename IteratorType::difference_type;
-
-  // step 1
-  using step1_func_type =
-      StdMoveFunctor<index_type, IteratorType, tmp_readwrite_iterator_type>;
-  ::Kokkos::parallel_for(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements_to_move),
-      step1_func_type(first, begin(tmp_view)));
-
-  // step 2
-  using step2_func_type =
-      StdMoveFunctor<index_type, tmp_readwrite_iterator_type, IteratorType>;
-  ::Kokkos::parallel_for(label,
-                         RangePolicy<ExecutionSpace>(ex, 0, tmp_view.extent(0)),
-                         step2_func_type(begin(tmp_view), first + n));
-
-  ex.fence("Kokkos::shift_right: fence after operation");
-
-  return first + n;
-}
-
-}  // namespace Impl
-
-// -------------------
-// reverse_copy
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator reverse_copy(const ExecutionSpace& ex, InputIterator first,
-                            InputIterator last, OutputIterator d_first) {
-  return Impl::reverse_copy_impl("Kokkos::reverse_copy_iterator_api_default",
-                                 ex, first, last, d_first);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator reverse_copy(const std::string& label, const ExecutionSpace& ex,
-                            InputIterator first, InputIterator last,
-                            OutputIterator d_first) {
-  return Impl::reverse_copy_impl(label, ex, first, last, d_first);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto reverse_copy(const ExecutionSpace& ex,
-                  const ::Kokkos::View<DataType1, Properties1...>& source,
-                  ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::reverse_copy_impl("Kokkos::reverse_copy_view_api_default", ex,
-                                 cbegin(source), cend(source), begin(dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto reverse_copy(const std::string& label, const ExecutionSpace& ex,
-                  const ::Kokkos::View<DataType1, Properties1...>& source,
-                  ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::reverse_copy_impl(label, ex, cbegin(source), cend(source),
-                                 begin(dest));
-}
-
-// -------------------
-// reverse
-// -------------------
-template <class ExecutionSpace, class InputIterator>
-void reverse(const ExecutionSpace& ex, InputIterator first,
-             InputIterator last) {
-  return Impl::reverse_impl("Kokkos::reverse_iterator_api_default", ex, first,
-                            last);
-}
-
-template <class ExecutionSpace, class InputIterator>
-void reverse(const std::string& label, const ExecutionSpace& ex,
-             InputIterator first, InputIterator last) {
-  return Impl::reverse_impl(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-void reverse(const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::reverse_impl("Kokkos::reverse_view_api_default", ex,
-                            KE::begin(view), KE::end(view));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-void reverse(const std::string& label, const ExecutionSpace& ex,
-             const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::reverse_impl(label, ex, KE::begin(view), KE::end(view));
-}
-
-// ----------------------
-// move
-// ----------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator move(const ExecutionSpace& ex, InputIterator first,
-                    InputIterator last, OutputIterator d_first) {
-  return Impl::move_impl("Kokkos::move_iterator_api_default", ex, first, last,
-                         d_first);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator move(const std::string& label, const ExecutionSpace& ex,
-                    InputIterator first, InputIterator last,
-                    OutputIterator d_first) {
-  return Impl::move_impl(label, ex, first, last, d_first);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto move(const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::move_impl("Kokkos::move_view_api_default", ex, begin(source),
-                         end(source), begin(dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto move(const std::string& label, const ExecutionSpace& ex,
-          const ::Kokkos::View<DataType1, Properties1...>& source,
-          ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::move_impl(label, ex, begin(source), end(source), begin(dest));
-}
-
-// -------------------
-// move_backward
-// -------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 move_backward(const ExecutionSpace& ex, IteratorType1 first,
-                            IteratorType1 last, IteratorType2 d_last) {
-  return Impl::move_backward_impl("Kokkos::move_backward_iterator_api_default",
-                                  ex, first, last, d_last);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto move_backward(const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::move_backward_impl("Kokkos::move_backward_view_api_default", ex,
-                                  begin(source), end(source), end(dest));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 move_backward(const std::string& label, const ExecutionSpace& ex,
-                            IteratorType1 first, IteratorType1 last,
-                            IteratorType2 d_last) {
-  return Impl::move_backward_impl(label, ex, first, last, d_last);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto move_backward(const std::string& label, const ExecutionSpace& ex,
-                   const ::Kokkos::View<DataType1, Properties1...>& source,
-                   ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::move_backward_impl(label, ex, begin(source), end(source),
-                                  end(dest));
-}
-
-// ----------------------
-// swap_ranges
-// ----------------------
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 swap_ranges(const ExecutionSpace& ex, IteratorType1 first1,
-                          IteratorType1 last1, IteratorType2 first2) {
-  return Impl::swap_ranges_impl("Kokkos::swap_ranges_iterator_api_default", ex,
-                                first1, last1, first2);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto swap_ranges(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  assert(source.extent(0) == dest.extent(0));
-  return Impl::swap_ranges_impl("Kokkos::swap_ranges_view_api_default", ex,
-                                begin(source), end(source), begin(dest));
-}
-
-template <class ExecutionSpace, class IteratorType1, class IteratorType2>
-IteratorType2 swap_ranges(const std::string& label, const ExecutionSpace& ex,
-                          IteratorType1 first1, IteratorType1 last1,
-                          IteratorType2 first2) {
-  return Impl::swap_ranges_impl(label, ex, first1, last1, first2);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto swap_ranges(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  assert(source.extent(0) == dest.extent(0));
-  return Impl::swap_ranges_impl(label, ex, begin(source), end(source),
-                                begin(dest));
-}
-
-// -------------------
-// unique
-// -------------------
-// note: the enable_if below is to avoid "call to ... is ambiguous"
-// for example in the unit test when using a variadic function
-
-// overload set1
-template <class ExecutionSpace, class IteratorType>
-std::enable_if_t<!::Kokkos::is_view<IteratorType>::value, IteratorType> unique(
-    const ExecutionSpace& ex, IteratorType first, IteratorType last) {
-  return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first,
-                           last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-std::enable_if_t<!::Kokkos::is_view<IteratorType>::value, IteratorType> unique(
-    const std::string& label, const ExecutionSpace& ex, IteratorType first,
-    IteratorType last) {
-  return Impl::unique_impl(label, ex, first, last);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto unique(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return ::Kokkos::Experimental::unique("Kokkos::unique_view_api_default", ex,
-                                        begin(view), end(view));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto unique(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return ::Kokkos::Experimental::unique(label, ex, begin(view), end(view));
-}
-
-// overload set2
-template <class ExecutionSpace, class IteratorType, class BinaryPredicate>
-IteratorType unique(const ExecutionSpace& ex, IteratorType first,
-                    IteratorType last, BinaryPredicate pred) {
-  return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first,
-                           last, pred);
-}
-
-template <class ExecutionSpace, class IteratorType, class BinaryPredicate>
-IteratorType unique(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType last,
-                    BinaryPredicate pred) {
-  return Impl::unique_impl(label, ex, first, last, pred);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicate>
-auto unique(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view,
-            BinaryPredicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::unique_impl("Kokkos::unique_view_api_default", ex, begin(view),
-                           end(view), std::move(pred));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class BinaryPredicate>
-auto unique(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view,
-            BinaryPredicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::unique_impl(label, ex, begin(view), end(view), std::move(pred));
-}
-
-// -------------------
-// unique_copy
-// -------------------
-// note: the enable_if below is to avoid "call to ... is ambiguous"
-// for example in the unit test when using a variadic function
-
-// overload set1
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-std::enable_if_t<!::Kokkos::is_view<InputIterator>::value, OutputIterator>
-unique_copy(const ExecutionSpace& ex, InputIterator first, InputIterator last,
-            OutputIterator d_first) {
-  return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex,
-                                first, last, d_first);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-std::enable_if_t<!::Kokkos::is_view<InputIterator>::value, OutputIterator>
-unique_copy(const std::string& label, const ExecutionSpace& ex,
-            InputIterator first, InputIterator last, OutputIterator d_first) {
-  return Impl::unique_copy_impl(label, ex, first, last, d_first);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto unique_copy(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return ::Kokkos::Experimental::unique_copy(
-      "Kokkos::unique_copy_view_api_default", ex, cbegin(source), cend(source),
-      begin(dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto unique_copy(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return ::Kokkos::Experimental::unique_copy(label, ex, cbegin(source),
-                                             cend(source), begin(dest));
-}
-
-// overload set2
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class BinaryPredicate>
-OutputIterator unique_copy(const ExecutionSpace& ex, InputIterator first,
-                           InputIterator last, OutputIterator d_first,
-                           BinaryPredicate pred) {
-  return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex,
-                                first, last, d_first, pred);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class BinaryPredicate>
-OutputIterator unique_copy(const std::string& label, const ExecutionSpace& ex,
-                           InputIterator first, InputIterator last,
-                           OutputIterator d_first, BinaryPredicate pred) {
-  return Impl::unique_copy_impl(label, ex, first, last, d_first, pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicate>
-auto unique_copy(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 const ::Kokkos::View<DataType2, Properties2...>& dest,
-                 BinaryPredicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::unique_copy_impl("Kokkos::unique_copy_view_api_default", ex,
-                                cbegin(source), cend(source), begin(dest),
-                                std::move(pred));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryPredicate>
-auto unique_copy(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 const ::Kokkos::View<DataType2, Properties2...>& dest,
-                 BinaryPredicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::unique_copy_impl(label, ex, cbegin(source), cend(source),
-                                begin(dest), std::move(pred));
-}
-
-// -------------------
-// rotate
-// -------------------
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType rotate(const ExecutionSpace& ex, IteratorType first,
-                    IteratorType n_first, IteratorType last) {
-  return Impl::rotate_impl("Kokkos::rotate_iterator_api_default", ex, first,
-                           n_first, last);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType rotate(const std::string& label, const ExecutionSpace& ex,
-                    IteratorType first, IteratorType n_first,
-                    IteratorType last) {
-  return Impl::rotate_impl(label, ex, first, n_first, last);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto rotate(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view,
-            std::size_t n_location) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::rotate_impl("Kokkos::rotate_view_api_default", ex, begin(view),
-                           begin(view) + n_location, end(view));
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto rotate(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view,
-            std::size_t n_location) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::rotate_impl(label, ex, begin(view), begin(view) + n_location,
-                           end(view));
-}
-
-// -------------------
-// rotate_copy
-// -------------------
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator rotate_copy(const ExecutionSpace& ex, InputIterator first,
-                           InputIterator n_first, InputIterator last,
-                           OutputIterator d_first) {
-  return Impl::rotate_copy_impl("Kokkos::rotate_copy_iterator_api_default", ex,
-                                first, n_first, last, d_first);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator>
-OutputIterator rotate_copy(const std::string& label, const ExecutionSpace& ex,
-                           InputIterator first, InputIterator n_first,
-                           InputIterator last, OutputIterator d_first) {
-  return Impl::rotate_copy_impl(label, ex, first, n_first, last, d_first);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto rotate_copy(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 std::size_t n_location,
-                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::rotate_copy_impl("Kokkos::rotate_copy_view_api_default", ex,
-                                cbegin(source), cbegin(source) + n_location,
-                                cend(source), begin(dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto rotate_copy(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& source,
-                 std::size_t n_location,
-                 const ::Kokkos::View<DataType2, Properties2...>& dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest);
-
-  return Impl::rotate_copy_impl(label, ex, cbegin(source),
-                                cbegin(source) + n_location, cend(source),
-                                begin(dest));
-}
-
-// -------------------
-// remove_if
-// -------------------
-template <class ExecutionSpace, class Iterator, class UnaryPredicate>
-Iterator remove_if(const ExecutionSpace& ex, Iterator first, Iterator last,
-                   UnaryPredicate pred) {
-  return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex,
-                              first, last, pred);
-}
-
-template <class ExecutionSpace, class Iterator, class UnaryPredicate>
-Iterator remove_if(const std::string& label, const ExecutionSpace& ex,
-                   Iterator first, Iterator last, UnaryPredicate pred) {
-  return Impl::remove_if_impl(label, ex, first, last, pred);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryPredicate>
-auto remove_if(const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType, Properties...>& view,
-               UnaryPredicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-
-  return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex,
-                              ::Kokkos::Experimental::begin(view),
-                              ::Kokkos::Experimental::end(view), pred);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class UnaryPredicate>
-auto remove_if(const std::string& label, const ExecutionSpace& ex,
-               const ::Kokkos::View<DataType, Properties...>& view,
-               UnaryPredicate pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::remove_if_impl(label, ex, ::Kokkos::Experimental::begin(view),
-                              ::Kokkos::Experimental::end(view), pred);
-}
-
-// -------------------
-// remove
-// -------------------
-template <class ExecutionSpace, class Iterator, class ValueType>
-Iterator remove(const ExecutionSpace& ex, Iterator first, Iterator last,
-                const ValueType& value) {
-  return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex, first,
-                           last, value);
-}
-
-template <class ExecutionSpace, class Iterator, class ValueType>
-Iterator remove(const std::string& label, const ExecutionSpace& ex,
-                Iterator first, Iterator last, const ValueType& value) {
-  return Impl::remove_impl(label, ex, first, last, value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType>
-auto remove(const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view,
-            const ValueType& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex,
-                           ::Kokkos::Experimental::begin(view),
-                           ::Kokkos::Experimental::end(view), value);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties,
-          class ValueType>
-auto remove(const std::string& label, const ExecutionSpace& ex,
-            const ::Kokkos::View<DataType, Properties...>& view,
-            const ValueType& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::remove_impl(label, ex, ::Kokkos::Experimental::begin(view),
-                           ::Kokkos::Experimental::end(view), value);
-}
-
-// -------------------
-// remove_copy
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
-OutputIterator remove_copy(const ExecutionSpace& ex, InputIterator first_from,
-                           InputIterator last_from, OutputIterator first_dest,
-                           const ValueType& value) {
-  return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex,
-                                first_from, last_from, first_dest, value);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class ValueType>
-OutputIterator remove_copy(const std::string& label, const ExecutionSpace& ex,
-                           InputIterator first_from, InputIterator last_from,
-                           OutputIterator first_dest, const ValueType& value) {
-  return Impl::remove_copy_impl(label, ex, first_from, last_from, first_dest,
-                                value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
-auto remove_copy(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                 const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                 const ValueType& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-
-  return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex,
-                                ::Kokkos::Experimental::cbegin(view_from),
-                                ::Kokkos::Experimental::cend(view_from),
-                                ::Kokkos::Experimental::begin(view_dest),
-                                value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
-auto remove_copy(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                 const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                 const ValueType& value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-
-  return Impl::remove_copy_impl(
-      label, ex, ::Kokkos::Experimental::cbegin(view_from),
-      ::Kokkos::Experimental::cend(view_from),
-      ::Kokkos::Experimental::begin(view_dest), value);
-}
-
-// -------------------
-// remove_copy_if
-// -------------------
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryPredicate>
-OutputIterator remove_copy_if(const ExecutionSpace& ex,
-                              InputIterator first_from, InputIterator last_from,
-                              OutputIterator first_dest,
-                              const UnaryPredicate& pred) {
-  return Impl::remove_copy_if_impl(
-      "Kokkos::remove_copy_if_iterator_api_default", ex, first_from, last_from,
-      first_dest, pred);
-}
-
-template <class ExecutionSpace, class InputIterator, class OutputIterator,
-          class UnaryPredicate>
-OutputIterator remove_copy_if(const std::string& label,
-                              const ExecutionSpace& ex,
-                              InputIterator first_from, InputIterator last_from,
-                              OutputIterator first_dest,
-                              const UnaryPredicate& pred) {
-  return Impl::remove_copy_if_impl(label, ex, first_from, last_from, first_dest,
-                                   pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryPredicate>
-auto remove_copy_if(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    const UnaryPredicate& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-
-  return Impl::remove_copy_if_impl(
-      "Kokkos::remove_copy_if_iterator_api_default", ex,
-      ::Kokkos::Experimental::cbegin(view_from),
-      ::Kokkos::Experimental::cend(view_from),
-      ::Kokkos::Experimental::begin(view_dest), pred);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class UnaryPredicate>
-auto remove_copy_if(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    const UnaryPredicate& pred) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-
-  return Impl::remove_copy_if_impl(
-      label, ex, ::Kokkos::Experimental::cbegin(view_from),
-      ::Kokkos::Experimental::cend(view_from),
-      ::Kokkos::Experimental::begin(view_dest), pred);
-}
-
-// -------------------
-// shift_left
-// -------------------
-template <class ExecutionSpace, class IteratorType>
-IteratorType shift_left(const ExecutionSpace& ex, IteratorType first,
-                        IteratorType last,
-                        typename IteratorType::difference_type n) {
-  return Impl::shift_left_impl("Kokkos::shift_left_iterator_api_default", ex,
-                               first, last, n);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType shift_left(const std::string& label, const ExecutionSpace& ex,
-                        IteratorType first, IteratorType last,
-                        typename IteratorType::difference_type n) {
-  return Impl::shift_left_impl(label, ex, first, last, n);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto shift_left(const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType, Properties...>& view,
-                typename decltype(begin(view))::difference_type n) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_left_impl("Kokkos::shift_left_view_api_default", ex,
-                               begin(view), end(view), n);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto shift_left(const std::string& label, const ExecutionSpace& ex,
-                const ::Kokkos::View<DataType, Properties...>& view,
-                typename decltype(begin(view))::difference_type n) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_left_impl(label, ex, begin(view), end(view), n);
-}
-
-// -------------------
-// shift_right
-// -------------------
-template <class ExecutionSpace, class IteratorType>
-IteratorType shift_right(const ExecutionSpace& ex, IteratorType first,
-                         IteratorType last,
-                         typename IteratorType::difference_type n) {
-  return Impl::shift_right_impl("Kokkos::shift_right_iterator_api_default", ex,
-                                first, last, n);
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType shift_right(const std::string& label, const ExecutionSpace& ex,
-                         IteratorType first, IteratorType last,
-                         typename IteratorType::difference_type n) {
-  return Impl::shift_right_impl(label, ex, first, last, n);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto shift_right(const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& view,
-                 typename decltype(begin(view))::difference_type n) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_right_impl("Kokkos::shift_right_view_api_default", ex,
-                                begin(view), end(view), n);
-}
-
-template <class ExecutionSpace, class DataType, class... Properties>
-auto shift_right(const std::string& label, const ExecutionSpace& ex,
-                 const ::Kokkos::View<DataType, Properties...>& view,
-                 typename decltype(begin(view))::difference_type n) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view);
-  return Impl::shift_right_impl(label, ex, begin(view), end(view), n);
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_ExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_ExclusiveScan.hpp
deleted file mode 100644
index 62ebbec42..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_ExclusiveScan.hpp
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_STD_NUMERICS_EXCLUSIVE_SCAN_HPP
-#define KOKKOS_STD_NUMERICS_EXCLUSIVE_SCAN_HPP
-
-#include <Kokkos_Core.hpp>
-#include "../Kokkos_BeginEnd.hpp"
-#include "../Kokkos_Constraints.hpp"
-#include "../Kokkos_Distance.hpp"
-#include "../Kokkos_ModifyingOperations.hpp"
-#include "../Kokkos_ValueWrapperForNoNeutralElement.hpp"
-#include "Kokkos_IdentityReferenceUnaryFunctor.hpp"
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest>
-struct ExclusiveScanDefaultFunctor {
-  using execution_space = ExeSpace;
-  using value_type =
-      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
-
-  ValueType m_init_value;
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-
-  KOKKOS_FUNCTION
-  ExclusiveScanDefaultFunctor(ValueType init, FirstFrom first_from,
-                              FirstDest first_dest)
-      : m_init_value(std::move(init)),
-        m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    if (final_pass) {
-      if (i == 0) {
-        m_first_dest[i] = m_init_value;
-      } else {
-        m_first_dest[i] = update.val + m_init_value;
-      }
-    }
-
-    const auto tmp = value_type{m_first_from[i], false};
-    this->join(update, tmp);
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.val        = {};
-    update.is_initial = true;
-  }
-
-  KOKKOS_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
-    if (update.is_initial) {
-      update.val        = input.val;
-      update.is_initial = false;
-    } else {
-      update.val = update.val + input.val;
-    }
-  }
-};
-
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest, class BinaryOpType, class UnaryOpType>
-struct TransformExclusiveScanFunctor {
-  using execution_space = ExeSpace;
-  using value_type =
-      ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement<ValueType>;
-
-  ValueType m_init_value;
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-  BinaryOpType m_binary_op;
-  UnaryOpType m_unary_op;
-
-  KOKKOS_FUNCTION
-  TransformExclusiveScanFunctor(ValueType init, FirstFrom first_from,
-                                FirstDest first_dest, BinaryOpType bop,
-                                UnaryOpType uop)
-      : m_init_value(std::move(init)),
-        m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_binary_op(std::move(bop)),
-        m_unary_op(std::move(uop)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    if (final_pass) {
-      if (i == 0) {
-        // for both ExclusiveScan and TransformExclusiveScan,
-        // init is unmodified
-        m_first_dest[i] = m_init_value;
-      } else {
-        m_first_dest[i] = m_binary_op(update.val, m_init_value);
-      }
-    }
-
-    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
-    this->join(update, tmp);
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.val        = {};
-    update.is_initial = true;
-  }
-
-  KOKKOS_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
-    if (update.is_initial) {
-      update.val = input.val;
-    } else {
-      update.val = m_binary_op(update.val, input.val);
-    }
-    update.is_initial = false;
-  }
-};
-
-// --------------------------------------------------
-// exclusive_scan_custom_op_impl
-// --------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType>
-OutputIteratorType exclusive_scan_custom_op_impl(
-    const std::string& label, const ExecutionSpace& ex,
-    InputIteratorType first_from, InputIteratorType last_from,
-    OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type    = typename InputIteratorType::difference_type;
-  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
-  using func_type =
-      TransformExclusiveScanFunctor<ExecutionSpace, index_type, ValueType,
-                                    InputIteratorType, OutputIteratorType,
-                                    BinaryOpType, unary_op_type>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(init_value, first_from, first_dest, bop, unary_op_type()));
-  ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-// --------------------------------------------------
-// transform_exclusive_scan_impl
-// --------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType,
-          class UnaryOpType>
-OutputIteratorType transform_exclusive_scan_impl(
-    const std::string& label, const ExecutionSpace& ex,
-    InputIteratorType first_from, InputIteratorType last_from,
-    OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop,
-    UnaryOpType uop) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using func_type =
-      TransformExclusiveScanFunctor<ExecutionSpace, index_type, ValueType,
-                                    InputIteratorType, OutputIteratorType,
-                                    BinaryOpType, UnaryOpType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(init_value, first_from, first_dest, bop, uop));
-  ex.fence("Kokkos::transform_exclusive_scan: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-// --------------------------------------------------
-// exclusive_scan_default_op_impl
-// --------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-OutputIteratorType exclusive_scan_default_op_impl(const std::string& label,
-                                                  const ExecutionSpace& ex,
-                                                  InputIteratorType first_from,
-                                                  InputIteratorType last_from,
-                                                  OutputIteratorType first_dest,
-                                                  ValueType init_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // does it make sense to do this static_assert too?
-  // using input_iterator_value_type = typename InputIteratorType::value_type;
-  // static_assert
-  //   (std::is_convertible<std::remove_cv_t<input_iterator_value_type>,
-  //   ValueType>::value,
-  //    "exclusive_scan: InputIteratorType::value_type not convertible to
-  //    ValueType");
-
-  // we are unnecessarily duplicating code, but this is on purpose
-  // so that we can use the default_op for OpenMPTarget.
-  // Originally, I had this implemented as:
-  // '''
-  // using bop_type   = StdExclusiveScanDefaultJoinFunctor<ValueType>;
-  // call exclusive_scan_custom_op_impl(..., bop_type());
-  // '''
-  // which avoids duplicating the functors, but for OpenMPTarget
-  // I cannot use a custom binary op.
-  // This is the same problem that occurs for reductions.
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using func_type =
-      ExclusiveScanDefaultFunctor<ExecutionSpace, index_type, ValueType,
-                                  InputIteratorType, OutputIteratorType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(label,
-                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                          func_type(init_value, first_from, first_dest));
-  ex.fence("Kokkos::exclusive_scan_default_op: fence after operation");
-
-  return first_dest + num_elements;
-}
-
-}  // end namespace Impl
-
-///////////////////////////////
-//
-// exclusive scan API
-//
-///////////////////////////////
-
-// overload set 1
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               ValueType init_value) {
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  return Impl::exclusive_scan_default_op_impl(
-      "Kokkos::exclusive_scan_default_functors_iterator_api", ex, first, last,
-      first_dest, init_value);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, ValueType init_value) {
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  return Impl::exclusive_scan_default_op_impl(label, ex, first, last,
-                                              first_dest, init_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
-auto exclusive_scan(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    ValueType init_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_default_op_impl(
-      "Kokkos::exclusive_scan_default_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      init_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType>
-auto exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    ValueType init_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from),
-                                              KE::cend(view_from),
-                                              KE::begin(view_dest), init_value);
-}
-
-// overload set 2
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               ValueType init_value, BinaryOpType bop) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  return Impl::exclusive_scan_custom_op_impl(
-      "Kokkos::exclusive_scan_custom_functors_iterator_api", ex, first, last,
-      first_dest, init_value, bop);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, ValueType init_value,
-               BinaryOpType bop) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  return Impl::exclusive_scan_custom_op_impl(label, ex, first, last, first_dest,
-                                             init_value, bop);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType>
-auto exclusive_scan(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    ValueType init_value, BinaryOpType bop) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_custom_op_impl(
-      "Kokkos::exclusive_scan_custom_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      init_value, bop);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType>
-auto exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    ValueType init_value, BinaryOpType bop) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::exclusive_scan_custom_op_impl(
-      label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), init_value, bop);
-}
-
-//////////////////////////////////////
-//
-// transform_exclusive_scan public API
-//
-//////////////////////////////////////
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType,
-          class UnaryOpType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-transform_exclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-                         InputIteratorType last, OutputIteratorType first_dest,
-                         ValueType init_value, BinaryOpType binary_op,
-                         UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  return Impl::transform_exclusive_scan_impl(
-      "Kokkos::transform_exclusive_scan_custom_functors_iterator_api", ex,
-      first, last, first_dest, init_value, binary_op, unary_op);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class ValueType, class BinaryOpType,
-          class UnaryOpType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-transform_exclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first, InputIteratorType last,
-                         OutputIteratorType first_dest, ValueType init_value,
-                         BinaryOpType binary_op, UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  return Impl::transform_exclusive_scan_impl(label, ex, first, last, first_dest,
-                                             init_value, binary_op, unary_op);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType, class UnaryOpType>
-auto transform_exclusive_scan(
-    const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-    ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_exclusive_scan_impl(
-      "Kokkos::transform_exclusive_scan_custom_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      init_value, binary_op, unary_op);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class ValueType,
-          class BinaryOpType, class UnaryOpType>
-auto transform_exclusive_scan(
-    const std::string& label, const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-    ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  static_assert(std::is_move_constructible<ValueType>::value,
-                "ValueType must be move constructible.");
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_exclusive_scan_impl(
-      label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), init_value, binary_op, unary_op);
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_InclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_InclusiveScan.hpp
deleted file mode 100644
index cdafc8186..000000000
--- a/packages/kokkos/algorithms/src/std_algorithms/numeric/Kokkos_InclusiveScan.hpp
+++ /dev/null
@@ -1,699 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_STD_NUMERICS_INCLUSIVE_SCAN_HPP
-#define KOKKOS_STD_NUMERICS_INCLUSIVE_SCAN_HPP
-
-#include <Kokkos_Core.hpp>
-#include "../Kokkos_BeginEnd.hpp"
-#include "../Kokkos_Constraints.hpp"
-#include "../Kokkos_Distance.hpp"
-#include "../Kokkos_ModifyingOperations.hpp"
-#include "../Kokkos_ValueWrapperForNoNeutralElement.hpp"
-#include "Kokkos_IdentityReferenceUnaryFunctor.hpp"
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest>
-struct InclusiveScanDefaultFunctor {
-  using execution_space = ExeSpace;
-  using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
-
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-
-  KOKKOS_FUNCTION
-  InclusiveScanDefaultFunctor(FirstFrom first_from, FirstDest first_dest)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    const auto tmp = value_type{m_first_from[i], false};
-    this->join(update, tmp);
-
-    if (final_pass) {
-      m_first_dest[i] = update.val;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.val        = {};
-    update.is_initial = true;
-  }
-
-  KOKKOS_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
-    if (update.is_initial) {
-      update.val = input.val;
-    } else {
-      update.val = update.val + input.val;
-    }
-    update.is_initial = false;
-  }
-};
-
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest, class BinaryOpType, class UnaryOpType>
-struct TransformInclusiveScanNoInitValueFunctor {
-  using execution_space = ExeSpace;
-  using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
-
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-  BinaryOpType m_binary_op;
-  UnaryOpType m_unary_op;
-
-  KOKKOS_FUNCTION
-  TransformInclusiveScanNoInitValueFunctor(FirstFrom first_from,
-                                           FirstDest first_dest,
-                                           BinaryOpType bop, UnaryOpType uop)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_binary_op(std::move(bop)),
-        m_unary_op(std::move(uop)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
-    this->join(update, tmp);
-    if (final_pass) {
-      m_first_dest[i] = update.val;
-    }
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.val        = {};
-    update.is_initial = true;
-  }
-
-  KOKKOS_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
-    if (update.is_initial) {
-      update.val = input.val;
-    } else {
-      update.val = m_binary_op(update.val, input.val);
-    }
-    update.is_initial = false;
-  }
-};
-
-template <class ExeSpace, class IndexType, class ValueType, class FirstFrom,
-          class FirstDest, class BinaryOpType, class UnaryOpType>
-struct TransformInclusiveScanWithInitValueFunctor {
-  using execution_space = ExeSpace;
-  using value_type      = ValueWrapperForNoNeutralElement<ValueType>;
-
-  FirstFrom m_first_from;
-  FirstDest m_first_dest;
-  BinaryOpType m_binary_op;
-  UnaryOpType m_unary_op;
-  ValueType m_init;
-
-  KOKKOS_FUNCTION
-  TransformInclusiveScanWithInitValueFunctor(FirstFrom first_from,
-                                             FirstDest first_dest,
-                                             BinaryOpType bop, UnaryOpType uop,
-                                             ValueType init)
-      : m_first_from(std::move(first_from)),
-        m_first_dest(std::move(first_dest)),
-        m_binary_op(std::move(bop)),
-        m_unary_op(std::move(uop)),
-        m_init(std::move(init)) {}
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, value_type& update,
-                  const bool final_pass) const {
-    const auto tmp = value_type{m_unary_op(m_first_from[i]), false};
-    this->join(update, tmp);
-
-    if (final_pass) {
-      m_first_dest[i] = m_binary_op(update.val, m_init);
-    }
-  }
-
-  KOKKOS_FUNCTION
-  void init(value_type& update) const {
-    update.val        = {};
-    update.is_initial = true;
-  }
-
-  KOKKOS_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
-    if (update.is_initial) {
-      update.val = input.val;
-    } else {
-      update.val = m_binary_op(update.val, input.val);
-    }
-    update.is_initial = false;
-  }
-};
-
-// -------------------------------------------------------------
-// inclusive_scan_default_op_impl
-// -------------------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType>
-OutputIteratorType inclusive_scan_default_op_impl(
-    const std::string& label, const ExecutionSpace& ex,
-    InputIteratorType first_from, InputIteratorType last_from,
-    OutputIteratorType first_dest) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using value_type =
-      std::remove_const_t<typename InputIteratorType::value_type>;
-  using func_type =
-      InclusiveScanDefaultFunctor<ExecutionSpace, index_type, value_type,
-                                  InputIteratorType, OutputIteratorType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(label,
-                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                          func_type(first_from, first_dest));
-  ex.fence("Kokkos::inclusive_scan_default_op: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-// -------------------------------------------------------------
-// inclusive_scan_custom_binary_op_impl
-// -------------------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType>
-OutputIteratorType inclusive_scan_custom_binary_op_impl(
-    const std::string& label, const ExecutionSpace& ex,
-    InputIteratorType first_from, InputIteratorType last_from,
-    OutputIteratorType first_dest, BinaryOpType binary_op) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using value_type =
-      std::remove_const_t<typename InputIteratorType::value_type>;
-  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<value_type>;
-  using func_type     = TransformInclusiveScanNoInitValueFunctor<
-      ExecutionSpace, index_type, value_type, InputIteratorType,
-      OutputIteratorType, BinaryOpType, unary_op_type>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(first_from, first_dest, binary_op, unary_op_type()));
-  ex.fence("Kokkos::inclusive_scan_custom_binary_op: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-// -------------------------------------------------------------
-// inclusive_scan_custom_binary_op_impl with init_value
-// -------------------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class ValueType>
-OutputIteratorType inclusive_scan_custom_binary_op_impl(
-    const std::string& label, const ExecutionSpace& ex,
-    InputIteratorType first_from, InputIteratorType last_from,
-    OutputIteratorType first_dest, BinaryOpType binary_op,
-    ValueType init_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type    = typename InputIteratorType::difference_type;
-  using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor<ValueType>;
-  using func_type     = TransformInclusiveScanWithInitValueFunctor<
-      ExecutionSpace, index_type, ValueType, InputIteratorType,
-      OutputIteratorType, BinaryOpType, unary_op_type>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(label,
-                          RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-                          func_type(first_from, first_dest, binary_op,
-                                    unary_op_type(), init_value));
-  ex.fence("Kokkos::inclusive_scan_custom_binary_op: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-// -------------------------------------------------------------
-// transform_inclusive_scan_impl without init_value
-// -------------------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
-OutputIteratorType transform_inclusive_scan_impl(const std::string& label,
-                                                 const ExecutionSpace& ex,
-                                                 InputIteratorType first_from,
-                                                 InputIteratorType last_from,
-                                                 OutputIteratorType first_dest,
-                                                 BinaryOpType binary_op,
-                                                 UnaryOpType unary_op) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using value_type =
-      std::remove_const_t<typename InputIteratorType::value_type>;
-  using func_type = TransformInclusiveScanNoInitValueFunctor<
-      ExecutionSpace, index_type, value_type, InputIteratorType,
-      OutputIteratorType, BinaryOpType, UnaryOpType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(first_from, first_dest, binary_op, unary_op));
-  ex.fence("Kokkos::transform_inclusive_scan: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-// -------------------------------------------------------------
-// transform_inclusive_scan_impl with init_value
-// -------------------------------------------------------------
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
-          class ValueType>
-OutputIteratorType transform_inclusive_scan_impl(
-    const std::string& label, const ExecutionSpace& ex,
-    InputIteratorType first_from, InputIteratorType last_from,
-    OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op,
-    ValueType init_value) {
-  // checks
-  Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest);
-  Impl::static_assert_iterators_have_matching_difference_type(first_from,
-                                                              first_dest);
-  Impl::expect_valid_range(first_from, last_from);
-
-  // aliases
-  using index_type = typename InputIteratorType::difference_type;
-  using func_type  = TransformInclusiveScanWithInitValueFunctor<
-      ExecutionSpace, index_type, ValueType, InputIteratorType,
-      OutputIteratorType, BinaryOpType, UnaryOpType>;
-
-  // run
-  const auto num_elements =
-      Kokkos::Experimental::distance(first_from, last_from);
-  ::Kokkos::parallel_scan(
-      label, RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_type(first_from, first_dest, binary_op, unary_op, init_value));
-  ex.fence("Kokkos::transform_inclusive_scan: fence after operation");
-
-  // return
-  return first_dest + num_elements;
-}
-
-}  // end namespace Impl
-
-///////////////////////////////
-//
-// inclusive scan API
-//
-///////////////////////////////
-
-// overload set 1
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest) {
-  return Impl::inclusive_scan_default_op_impl(
-      "Kokkos::inclusive_scan_default_functors_iterator_api", ex, first, last,
-      first_dest);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest) {
-  return Impl::inclusive_scan_default_op_impl(label, ex, first, last,
-                                              first_dest);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto inclusive_scan(
-    const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_default_op_impl(
-      "Kokkos::inclusive_scan_default_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest));
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2>
-auto inclusive_scan(
-    const std::string& label, const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from),
-                                              KE::cend(view_from),
-                                              KE::begin(view_dest));
-}
-
-// overload set 2 (accepting custom binary op)
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               BinaryOp binary_op) {
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last,
-      first_dest, binary_op);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, BinaryOp binary_op) {
-  return Impl::inclusive_scan_custom_binary_op_impl(label, ex, first, last,
-                                                    first_dest, binary_op);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp>
-auto inclusive_scan(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    BinaryOp binary_op) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      "Kokkos::inclusive_scan_custom_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      binary_op);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp>
-auto inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    BinaryOp binary_op) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), binary_op);
-}
-
-// overload set 3
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp, class ValueType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-               InputIteratorType last, OutputIteratorType first_dest,
-               BinaryOp binary_op, ValueType init_value) {
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last,
-      first_dest, binary_op, init_value);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOp, class ValueType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-               InputIteratorType first, InputIteratorType last,
-               OutputIteratorType first_dest, BinaryOp binary_op,
-               ValueType init_value) {
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      label, ex, first, last, first_dest, binary_op, init_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp,
-          class ValueType>
-auto inclusive_scan(const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    BinaryOp binary_op, ValueType init_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      "Kokkos::inclusive_scan_custom_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      binary_op, init_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOp,
-          class ValueType>
-auto inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-                    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-                    BinaryOp binary_op, ValueType init_value) {
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::inclusive_scan_custom_binary_op_impl(
-      label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), binary_op, init_value);
-}
-
-//////////////////////////////////////
-//
-// transform_inclusive_scan public API
-//
-//////////////////////////////////////
-
-// overload set 1 (no init value)
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-                         InputIteratorType last, OutputIteratorType first_dest,
-                         BinaryOpType binary_op, UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::transform_inclusive_scan_impl(
-      "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex,
-      first, last, first_dest, binary_op, unary_op);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first, InputIteratorType last,
-                         OutputIteratorType first_dest, BinaryOpType binary_op,
-                         UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-
-  return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest,
-                                             binary_op, unary_op);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType>
-auto transform_inclusive_scan(
-    const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-    BinaryOpType binary_op, UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
-      "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      binary_op, unary_op);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType>
-auto transform_inclusive_scan(
-    const std::string& label, const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-    BinaryOpType binary_op, UnaryOpType unary_op) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
-      label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), binary_op, unary_op);
-}
-
-// overload set 2 (init value)
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
-          class ValueType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first,
-                         InputIteratorType last, OutputIteratorType first_dest,
-                         BinaryOpType binary_op, UnaryOpType unary_op,
-                         ValueType init_value) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::transform_inclusive_scan_impl(
-      "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex,
-      first, last, first_dest, binary_op, unary_op, init_value);
-}
-
-template <class ExecutionSpace, class InputIteratorType,
-          class OutputIteratorType, class BinaryOpType, class UnaryOpType,
-          class ValueType>
-std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators<
-                      InputIteratorType, OutputIteratorType>::value,
-                  OutputIteratorType>
-transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex,
-                         InputIteratorType first, InputIteratorType last,
-                         OutputIteratorType first_dest, BinaryOpType binary_op,
-                         UnaryOpType unary_op, ValueType init_value) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest,
-                                             binary_op, unary_op, init_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType, class ValueType>
-auto transform_inclusive_scan(
-    const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
-      "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex,
-      KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest),
-      binary_op, unary_op, init_value);
-}
-
-template <class ExecutionSpace, class DataType1, class... Properties1,
-          class DataType2, class... Properties2, class BinaryOpType,
-          class UnaryOpType, class ValueType>
-auto transform_inclusive_scan(
-    const std::string& label, const ExecutionSpace& ex,
-    const ::Kokkos::View<DataType1, Properties1...>& view_from,
-    const ::Kokkos::View<DataType2, Properties2...>& view_dest,
-    BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) {
-  Impl::static_assert_is_not_openmptarget(ex);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from);
-  Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest);
-  namespace KE = ::Kokkos::Experimental;
-  return Impl::transform_inclusive_scan_impl(
-      label, ex, KE::cbegin(view_from), KE::cend(view_from),
-      KE::begin(view_dest), binary_op, unary_op, init_value);
-}
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#endif
diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
index 94e6b2784..0c50ff7a0 100644
--- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
@@ -80,6 +80,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget)
 	StdAlgorithmsSearch
 	StdAlgorithmsSearch_n
 	StdAlgorithmsMismatch
+	StdAlgorithmsMoveBackward
 	)
       list(APPEND STDALGO_SOURCES_C Test${Name}.cpp)
     endforeach()
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
index 464c86a7b..19c82003c 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -98,16 +98,6 @@ struct RandomProperties {
     max = add.max > max ? add.max : max;
     return *this;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator+=(const volatile RandomProperties& add) volatile {
-    count += add.count;
-    mean += add.mean;
-    variance += add.variance;
-    covariance += add.covariance;
-    min = add.min < min ? add.min : min;
-    max = add.max > max ? add.max : max;
-  }
 };
 
 // FIXME_OPENMPTARGET: Need this for OpenMPTarget because contra to the standard
@@ -532,13 +522,15 @@ struct TestDynRankView {
     Pool random(13);
     double min = 10.;
     double max = 100.;
-    Kokkos::fill_random(A, random, min, max);
+    ExecutionSpace exec;
+    Kokkos::fill_random(exec, A, random, min, max);
 
     ReducerValueType val;
-    Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, A.size()),
-                            *this, ReducerType(val));
+    Kokkos::parallel_reduce(
+        Kokkos::RangePolicy<ExecutionSpace>(exec, 0, A.size()), *this,
+        ReducerType(val));
 
-    Kokkos::fence();
+    exec.fence();
     ASSERT_GE(val.min_val, min);
     ASSERT_LE(val.max_val, max);
   }
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp
index 23e8fec7d..5136ad434 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp
@@ -43,9 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_RandomAccessIterator.hpp>
-#include <std_algorithms/Kokkos_Distance.hpp>
 
 namespace KE = Kokkos::Experimental;
 
@@ -206,28 +203,28 @@ TEST_F(random_access_iterator_test, operatorsSet4) {
   auto it4 = KE::Impl::RandomAccessIterator<static_view_t>(m_static_view, 4);
   auto it5 = KE::Impl::RandomAccessIterator<dyn_view_t>(m_dynamic_view, 4);
   auto it6 = KE::Impl::RandomAccessIterator<strided_view_t>(m_strided_view, 4);
-  EXPECT_TRUE(it1 != it4);
-  EXPECT_TRUE(it2 != it5);
-  EXPECT_TRUE(it3 != it6);
-  EXPECT_TRUE(it1 < it4);
-  EXPECT_TRUE(it2 < it5);
-  EXPECT_TRUE(it3 < it6);
-  EXPECT_TRUE(it1 <= it4);
-  EXPECT_TRUE(it2 <= it5);
-  EXPECT_TRUE(it3 <= it6);
+  EXPECT_NE(it1, it4);
+  EXPECT_NE(it2, it5);
+  EXPECT_NE(it3, it6);
+  EXPECT_LT(it1, it4);
+  EXPECT_LT(it2, it5);
+  EXPECT_LT(it3, it6);
+  EXPECT_LE(it1, it4);
+  EXPECT_LE(it2, it5);
+  EXPECT_LE(it3, it6);
 
   auto it7 = KE::Impl::RandomAccessIterator<static_view_t>(m_static_view, 3);
   auto it8 = KE::Impl::RandomAccessIterator<dyn_view_t>(m_dynamic_view, 3);
   auto it9 = KE::Impl::RandomAccessIterator<strided_view_t>(m_strided_view, 3);
-  EXPECT_TRUE(it1 == it7);
-  EXPECT_TRUE(it2 == it8);
-  EXPECT_TRUE(it3 == it9);
-  EXPECT_TRUE(it1 >= it7);
-  EXPECT_TRUE(it2 >= it8);
-  EXPECT_TRUE(it3 >= it9);
-  EXPECT_TRUE(it4 > it7);
-  EXPECT_TRUE(it5 > it8);
-  EXPECT_TRUE(it6 > it9);
+  EXPECT_EQ(it1, it7);
+  EXPECT_EQ(it2, it8);
+  EXPECT_EQ(it3, it9);
+  EXPECT_GE(it1, it7);
+  EXPECT_GE(it2, it8);
+  EXPECT_GE(it3, it9);
+  EXPECT_GT(it4, it7);
+  EXPECT_GT(it5, it8);
+  EXPECT_GT(it6, it9);
 }
 
 TEST_F(random_access_iterator_test, assignment_operator) {
diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
index 9108731c1..120a04bdb 100644
--- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -137,7 +137,12 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
   // Test sorting array with all numbers equal
   ExecutionSpace exec;
   Kokkos::deep_copy(exec, keys, KeyType(1));
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   Kokkos::sort(exec, keys, force_kokkos);
+#else
+  (void)force_kokkos;  // suppress warnings about unused variable
+  Kokkos::sort(exec, keys);
+#endif
 
   Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
   Kokkos::fill_random(keys, g,
@@ -151,7 +156,11 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
   Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
                           sum<ExecutionSpace, KeyType>(keys), sum_before);
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   Kokkos::sort(exec, keys, force_kokkos);
+#else
+  Kokkos::sort(exec, keys);
+#endif
 
   Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
                           sum<ExecutionSpace, KeyType>(keys), sum_after);
@@ -396,7 +405,7 @@ void test_sort_integer_overflow() {
             Kokkos::Experimental::finite_min<T>::value};
   auto vd = Kokkos::create_mirror_view_and_copy(
       ExecutionSpace(), Kokkos::View<T[2], Kokkos::HostSpace>(a));
-  Kokkos::sort(vd, /*force using Kokkos bin sort*/ true);
+  Kokkos::sort(vd);
   auto vh = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), vd);
   EXPECT_TRUE(std::is_sorted(vh.data(), vh.data() + 2))
       << "view (" << vh[0] << ", " << vh[1] << ") is not sorted";
@@ -407,7 +416,9 @@ void test_sort_integer_overflow() {
 template <class ExecutionSpace, typename KeyType>
 void test_1D_sort(unsigned int N) {
   test_1D_sort_impl<ExecutionSpace, KeyType>(N * N * N, true);
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   test_1D_sort_impl<ExecutionSpace, KeyType>(N * N * N, false);
+#endif
 }
 
 template <class ExecutionSpace, typename KeyType>
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp
index 4036112b4..d37f657f5 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp
@@ -44,7 +44,7 @@
 
 #include <TestStdAlgorithmsCommon.hpp>
 #include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_Numeric.hpp>
+#include <std_algorithms/Kokkos_AdjacentDifference.hpp>
 #include <utility>
 #include <numeric>
 
@@ -185,7 +185,7 @@ void verify_data(TestViewType test_view, GoldViewType gold) {
   const auto gold_h = create_mirror_view_and_copy(Kokkos::HostSpace(), gold);
 
   for (std::size_t i = 0; i < test_view.extent(0); ++i) {
-    EXPECT_TRUE(gold_h(i) == test_view_dc_h(i));
+    EXPECT_EQ(gold_h(i), test_view_dc_h(i));
   }
 }
 
@@ -225,7 +225,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto res1 = KE::adjacent_difference(exespace(), KE::cbegin(view_from),
                                         KE::cend(view_from),
                                         KE::begin(view_dest), args...);
-    EXPECT_TRUE(res1 == KE::end(view_dest));
+    EXPECT_EQ(res1, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
@@ -235,7 +235,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto res2 = KE::adjacent_difference(
         "label", exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), args...);
-    EXPECT_TRUE(res2 == KE::end(view_dest));
+    EXPECT_EQ(res2, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
@@ -244,7 +244,7 @@ void run_single_scenario(const InfoType& scenario_info,
         create_view<ValueType>(Tag{}, view_ext, "adj_diff_dest_view");
     auto res3 =
         KE::adjacent_difference(exespace(), view_from, view_dest, args...);
-    EXPECT_TRUE(res3 == KE::end(view_dest));
+    EXPECT_EQ(res3, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
@@ -253,7 +253,7 @@ void run_single_scenario(const InfoType& scenario_info,
         create_view<ValueType>(Tag{}, view_ext, "adj_diff_dest_view");
     auto res4 = KE::adjacent_difference("label", exespace(), view_from,
                                         view_dest, args...);
-    EXPECT_TRUE(res4 == KE::end(view_dest));
+    EXPECT_EQ(res4, KE::end(view_dest));
     verify_data(view_dest, gold);
   }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp
index 6433a9cf6..874748193 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp
@@ -44,7 +44,7 @@
 
 #include <TestStdAlgorithmsCommon.hpp>
 #include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
+#include "std_algorithms/Kokkos_AdjacentFind.hpp"
 #include <utility>
 
 namespace Test {
@@ -257,7 +257,7 @@ void verify(DiffType my_diff, ViewType view, Args... args) {
       my_std_adjacent_find(KE::cbegin(view_h), KE::cend(view_h), args...);
   const auto std_diff = std_r - KE::cbegin(view_h);
 
-  EXPECT_TRUE(my_diff == std_diff);
+  EXPECT_EQ(my_diff, std_diff);
 }
 
 template <class Tag, class ValueType, class InfoType, class... Args>
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp
index 65b600096..a1307d4c2 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAllAnyNoneOf.cpp
@@ -44,7 +44,9 @@
 
 #include <TestStdAlgorithmsCommon.hpp>
 #include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
+#include <std_algorithms/Kokkos_AllOf.hpp>
+#include <std_algorithms/Kokkos_AnyOf.hpp>
+#include <std_algorithms/Kokkos_NoneOf.hpp>
 #include <algorithm>
 
 namespace Test {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
index 6d2b65a56..a06f9c61c 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp
@@ -46,8 +46,9 @@
 #define KOKKOS_ALGORITHMS_UNITTESTS_TEST_STD_ALGOS_COMMON_HPP
 
 #include <gtest/gtest.h>
+#include <Kokkos_StdAlgorithms.hpp>
 #include <TestStdAlgorithmsHelperFunctors.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
+#include <utility>
 #include <numeric>
 #include <random>
 
@@ -249,6 +250,71 @@ struct std_algorithms_test : public ::testing::Test {
   }
 };
 
+struct CustomValueType {
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType(){};
+
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType(value_type val) : value(val){};
+
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType(const CustomValueType& other) { this->value = other.value; }
+
+  KOKKOS_INLINE_FUNCTION
+  explicit operator value_type() const { return value; }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& operator()() { return value; }
+
+  KOKKOS_INLINE_FUNCTION
+  const value_type& operator()() const { return value; }
+
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType& operator+=(const CustomValueType& other) {
+    this->value += other.value;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType& operator=(const CustomValueType& other) {
+    this->value = other.value;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType operator+(const CustomValueType& other) const {
+    CustomValueType result;
+    result.value = this->value + other.value;
+    return result;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType operator-(const CustomValueType& other) const {
+    CustomValueType result;
+    result.value = this->value - other.value;
+    return result;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  CustomValueType operator*(const CustomValueType& other) const {
+    CustomValueType result;
+    result.value = this->value * other.value;
+    return result;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator==(const CustomValueType& other) const {
+    return this->value == other.value;
+  }
+
+ private:
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const CustomValueType& custom_value_type) {
+    return os << custom_value_type.value;
+  }
+  value_type value = {};
+};
+
 }  // namespace stdalgos
 }  // namespace Test
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp
index 2f2172ecc..037dac36e 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp
@@ -42,7 +42,6 @@
 //@HEADER
 */
 
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
 #include <Kokkos_StdAlgorithms.hpp>
 
 namespace Test {
@@ -61,12 +60,6 @@ struct TrivialBinaryFunctor {
   ValueType operator()(const ValueType &a, const ValueType &b) const {
     return (a + b);
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType &a,
-                       const volatile ValueType &b) const {
-    return (a + b);
-  }
 };
 
 template <class ValueType>
@@ -100,12 +93,6 @@ struct TrivialComparator {
   bool operator()(const ValueType &a, const ValueType &b) const {
     return a > b;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator()(const volatile ValueType &a,
-                  const volatile ValueType &b) const {
-    return a > b;
-  }
 };
 
 template <class ValueType>
@@ -120,12 +107,6 @@ struct TrivialReduceJoinFunctor {
   ValueType operator()(const ValueType &a, const ValueType &b) const {
     return a + b;
   }
-
-  KOKKOS_FUNCTION
-  ValueType operator()(const volatile ValueType &a,
-                       const volatile ValueType &b) const {
-    return a + b;
-  }
 };
 
 template <class ValueType>
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp
index b1981df28..3eb13c98c 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp
@@ -44,7 +44,7 @@
 
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
-#include <std_algorithms/Kokkos_Constraints.hpp>
+#include <Kokkos_StdAlgorithms.hpp>
 
 namespace Test {
 namespace stdalgos {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp
index f1d078bd7..d5758e243 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -165,49 +163,49 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(0));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(2));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(0));
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(2));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(0));
   }
 
   else if (name == "small-a") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(-4));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(-2));
-    EXPECT_TRUE(view_test_h(2) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(3) == static_cast<value_type>(2));
-    EXPECT_TRUE(view_test_h(4) == static_cast<value_type>(4));
-    EXPECT_TRUE(view_test_h(5) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(6) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(7) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(8) == static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(-4));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(-2));
+    EXPECT_EQ(view_test_h(2), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(3), static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(4), static_cast<value_type>(4));
+    EXPECT_EQ(view_test_h(5), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(6), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(7), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(8), static_cast<value_type>(0));
   }
 
   else if (name == "small-b") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(22));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(-12));
-    EXPECT_TRUE(view_test_h(2) == static_cast<value_type>(22));
-    EXPECT_TRUE(view_test_h(3) == static_cast<value_type>(-12));
-    EXPECT_TRUE(view_test_h(4) == static_cast<value_type>(22));
-    EXPECT_TRUE(view_test_h(5) == static_cast<value_type>(-12));
-    EXPECT_TRUE(view_test_h(6) == static_cast<value_type>(22));
-    EXPECT_TRUE(view_test_h(7) == static_cast<value_type>(-12));
-    EXPECT_TRUE(view_test_h(8) == static_cast<value_type>(22));
-    EXPECT_TRUE(view_test_h(9) == static_cast<value_type>(-12));
-    EXPECT_TRUE(view_test_h(10) == static_cast<value_type>(22));
-    EXPECT_TRUE(view_test_h(11) == static_cast<value_type>(-12));
-    EXPECT_TRUE(view_test_h(12) == static_cast<value_type>(22));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(22));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(-12));
+    EXPECT_EQ(view_test_h(2), static_cast<value_type>(22));
+    EXPECT_EQ(view_test_h(3), static_cast<value_type>(-12));
+    EXPECT_EQ(view_test_h(4), static_cast<value_type>(22));
+    EXPECT_EQ(view_test_h(5), static_cast<value_type>(-12));
+    EXPECT_EQ(view_test_h(6), static_cast<value_type>(22));
+    EXPECT_EQ(view_test_h(7), static_cast<value_type>(-12));
+    EXPECT_EQ(view_test_h(8), static_cast<value_type>(22));
+    EXPECT_EQ(view_test_h(9), static_cast<value_type>(-12));
+    EXPECT_EQ(view_test_h(10), static_cast<value_type>(22));
+    EXPECT_EQ(view_test_h(11), static_cast<value_type>(-12));
+    EXPECT_EQ(view_test_h(12), static_cast<value_type>(22));
   }
 
   else if (name == "medium" || name == "large") {
@@ -220,13 +218,14 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
     std::size_t count = 0;
     for (std::size_t i = 0; i < view_from_h.extent(0); ++i) {
       if (pred(view_from_h(i))) {
-        EXPECT_TRUE(view_test_h(count++) == view_from_h(i));
+        EXPECT_EQ(view_test_h(count), view_from_h(i));
+        count++;
       }
     }
     // all other entries of test view should be zero
     for (; count < view_test_h.extent(0); ++count) {
       // std::cout << count << '\n';
-      EXPECT_TRUE(view_test_h(count) == value_type(0));
+      EXPECT_EQ(view_test_h(count), value_type(0));
     }
   }
 
@@ -255,7 +254,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit       = KE::copy_if(exespace(), KE::cbegin(view_from),
                            KE::cend(view_from), KE::begin(view_dest), pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -264,7 +263,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit       = KE::copy_if("label", exespace(), KE::cbegin(view_from),
                            KE::cend(view_from), KE::begin(view_dest), pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -272,7 +271,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto view_dest = create_view<ValueType>(Tag{}, view_ext, "copy_if_dest");
     auto rit       = KE::copy_if(exespace(), view_from, view_dest, pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -280,7 +279,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto view_dest = create_view<ValueType>(Tag{}, view_ext, "copy_if_dest");
     auto rit = KE::copy_if("label", exespace(), view_from, view_dest, pred);
     verify_data(name, view_from, view_dest, pred);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp
index dfc7d794e..4c92a9905 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCount.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
 #include <algorithm>
 
 namespace Test {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsEqual.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsEqual.cpp
index 78edff423..e5b1e8514 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsEqual.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsEqual.cpp
@@ -43,9 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <algorithm>
 
 namespace Test {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
index 99c921323..e470ee862 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_Numeric.hpp>
 #include <utility>
 
 namespace Test {
@@ -78,10 +76,16 @@ struct UnifDist<int> {
   int operator()() { return m_dist(m_gen); }
 };
 
-template <class ViewType>
-void fill_zero(ViewType view) {
-  Kokkos::parallel_for(view.extent(0), FillZeroFunctor<ViewType>(view));
-}
+template <>
+struct UnifDist<CustomValueType> {
+  using dist_type = std::uniform_real_distribution<double>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist() : m_dist(0.05, 1.2) { m_gen.seed(1034343); }
+
+  CustomValueType operator()() { return m_dist(m_gen); }
+};
 
 template <class ViewType>
 void fill_view(ViewType dest_view, const std::string& name) {
@@ -181,15 +185,17 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << gold_h(i) << " " << test_view_h(i) << " "
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_TRUE(gold_h(i) == test_view_h(i));
+        EXPECT_EQ(gold_h(i), test_view_h(i));
       } else {
-        const auto error = std::abs(gold_h(i) - test_view_h(i));
+        const auto error =
+            std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
         if (error > 1e-10) {
           std::cout << i << " " << std::setprecision(15) << data_view_h(i)
                     << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
+                    << std::abs(static_cast<double>(gold_h(i) - test_view_h(i)))
+                    << std::endl;
         }
-        EXPECT_TRUE(error < 1e-10);
+        EXPECT_LT(error, 1e-10);
       }
     }
   }
@@ -201,12 +207,6 @@ struct MultiplyFunctor {
   ValueType operator()(const ValueType& a, const ValueType& b) const {
     return (a * b);
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType& a,
-                       const volatile ValueType& b) const {
-    return (a * b);
-  }
 };
 
 template <class ValueType>
@@ -215,12 +215,6 @@ struct SumFunctor {
   ValueType operator()(const ValueType& a, const ValueType& b) const {
     return (a + b);
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType& a,
-                       const volatile ValueType& b) const {
-    return (a + b);
-  }
 };
 
 std::string value_type_to_string(int) { return "int"; }
@@ -247,7 +241,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
@@ -256,14 +250,14 @@ void run_single_scenario_default_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
@@ -271,7 +265,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest,
                                 init_value);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, default_op());
   }
 
@@ -297,7 +291,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value, bop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -306,7 +300,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest),
                                 init_value, bop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -314,7 +308,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r =
         KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -322,7 +316,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest,
                                 init_value, bop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop);
   }
 
@@ -342,7 +336,7 @@ void run_exclusive_scan_all_scenarios() {
     run_single_scenario_default_op<Tag, ValueType>(it, ValueType{-2});
     run_single_scenario_default_op<Tag, ValueType>(it, ValueType{3});
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
     // custom multiply op is only run for small views otherwise it overflows
     if (it.first == "small-a" || it.first == "small-b") {
       using custom_bop_t = MultiplyFunctor<ValueType>;
@@ -374,6 +368,8 @@ TEST(std_algorithms_numeric_ops_test, exclusive_scan) {
   run_exclusive_scan_all_scenarios<StridedThreeTag, double>();
   run_exclusive_scan_all_scenarios<DynamicTag, int>();
   run_exclusive_scan_all_scenarios<StridedThreeTag, int>();
+  run_exclusive_scan_all_scenarios<DynamicTag, CustomValueType>();
+  run_exclusive_scan_all_scenarios<StridedThreeTag, CustomValueType>();
 }
 
 }  // namespace EScan
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp
index 357e733dc..35b232e94 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFind.cpp
@@ -44,8 +44,6 @@
 
 #include <TestStdAlgorithmsCommon.hpp>
 #include <iterator>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
 #include <algorithm>
 
 namespace Test {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp
index b4685ced7..2a6d27185 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindEnd.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -312,7 +310,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
     // std::cout << "result : " << mydiff << " " << stddiff << std::endl;
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
@@ -321,21 +319,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                      KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::find_end(exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit = KE::find_end("label", exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp
index bd6ea0300..84892bc37 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsFindFirstOf.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -231,7 +229,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                           KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
@@ -240,21 +238,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                           KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::find_first_of(exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit = KE::find_first_of("label", exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp
index c8cec00ed..79badc7c4 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
 #include <algorithm>
 
 namespace Test {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp
index ef366c56e..882a6012e 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp
@@ -156,12 +156,6 @@ struct CustomLessThanComparator {
     return a < b;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  bool operator()(const volatile ValueType1& a,
-                  const volatile ValueType1& b) const {
-    return a < b;
-  }
-
   KOKKOS_INLINE_FUNCTION
   CustomLessThanComparator() {}
 };
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
index 0f90623a3..173fbed66 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_Numeric.hpp>
 #include <utility>
 
 namespace Test {
@@ -78,10 +76,16 @@ struct UnifDist<int> {
   int operator()() { return m_dist(m_gen); }
 };
 
-template <class ViewType>
-void fill_zero(ViewType view) {
-  Kokkos::parallel_for(view.extent(0), FillZeroFunctor<ViewType>(view));
-}
+template <>
+struct UnifDist<CustomValueType> {
+  using dist_type = std::uniform_real_distribution<double>;
+  std::mt19937 m_gen;
+  dist_type m_dist;
+
+  UnifDist() : m_dist(0.05, 1.2) { m_gen.seed(1034343); }
+
+  CustomValueType operator()() { return m_dist(m_gen); }
+};
 
 template <class ViewType>
 void fill_view(ViewType dest_view, const std::string& name) {
@@ -195,15 +199,17 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
 
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_TRUE(gold_h(i) == test_view_h(i));
+        EXPECT_EQ(gold_h(i), test_view_h(i));
       } else {
-        const auto error = std::abs(gold_h(i) - test_view_h(i));
+        const auto error =
+            std::abs(static_cast<double>(gold_h(i) - test_view_h(i)));
         if (error > 1e-10) {
           std::cout << i << " " << std::setprecision(15) << data_view_h(i)
                     << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
+                    << std::abs(static_cast<double>(gold_h(i) - test_view_h(i)))
+                    << std::endl;
         }
-        EXPECT_TRUE(error < 1e-10);
+        EXPECT_LT(error, 1e-10);
       }
     }
     // std::cout << " last el: " << test_view_h(ext-1) << std::endl;
@@ -216,12 +222,6 @@ struct MultiplyFunctor {
   ValueType operator()(const ValueType& a, const ValueType& b) const {
     return (a * b);
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType& a,
-                       const volatile ValueType& b) const {
-    return (a * b);
-  }
 };
 
 template <class ValueType>
@@ -230,12 +230,6 @@ struct SumFunctor {
   ValueType operator()(const ValueType& a, const ValueType& b) const {
     return (a + b);
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType& a,
-                       const volatile ValueType& b) const {
-    return (a + b);
-  }
 };
 
 std::string value_type_to_string(int) { return "int"; }
@@ -258,7 +252,7 @@ void run_single_scenario_default_op(const InfoType& scenario_info) {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest));
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
@@ -266,21 +260,21 @@ void run_single_scenario_default_op(const InfoType& scenario_info) {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest));
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan(exespace(), view_from, view_dest);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, default_op());
   }
 
@@ -313,7 +307,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop,
     auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest), bop,
                                 args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
@@ -322,14 +316,14 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop,
     auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from),
                                 KE::cend(view_from), KE::begin(view_dest), bop,
                                 args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
   {
     fill_zero(view_dest);
     auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
@@ -337,7 +331,7 @@ void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop,
     fill_zero(view_dest);
     auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop,
                                 args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, bop, args...);
   }
 
@@ -354,7 +348,7 @@ void run_inclusive_scan_all_scenarios() {
   for (const auto& it : scenarios) {
     run_single_scenario_default_op<Tag, ValueType>(it);
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
     // the sum custom op is always run
     using sum_binary_op = SumFunctor<ValueType>;
     sum_binary_op sbop;
@@ -383,6 +377,8 @@ TEST(std_algorithms_numeric_ops_test, inclusive_scan) {
   run_inclusive_scan_all_scenarios<StridedThreeTag, double>();
   run_inclusive_scan_all_scenarios<DynamicTag, int>();
   run_inclusive_scan_all_scenarios<StridedThreeTag, int>();
+  run_inclusive_scan_all_scenarios<DynamicTag, CustomValueType>();
+  run_inclusive_scan_all_scenarios<StridedThreeTag, CustomValueType>();
 }
 
 }  // namespace IncScan
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp
index acfb4c3f4..b0df93539 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_SortingOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -178,7 +176,7 @@ void run_single_scenario(const InfoType& scenario_info) {
                                 [=](bool v) { return v == gold; });
   EXPECT_TRUE(allA);
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
   CustomLessThanComparator<ValueType, ValueType> comp;
   std::vector<bool> resultsB(4);
   resultsB[0] =
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
index 3860fecfc..7f0071e24 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_SortingOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -175,12 +173,12 @@ void run_single_scenario(const InfoType& scenario_info) {
       KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view));
   auto r3 = KE::is_sorted_until(exespace(), view);
   auto r4 = KE::is_sorted_until("label", exespace(), view);
-  EXPECT_TRUE(r1 == gold);
-  EXPECT_TRUE(r2 == gold);
-  EXPECT_TRUE(r3 == gold);
-  EXPECT_TRUE(r4 == gold);
+  EXPECT_EQ(r1, gold);
+  EXPECT_EQ(r2, gold);
+  EXPECT_EQ(r3, gold);
+  EXPECT_EQ(r4, gold);
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
   CustomLessThanComparator<ValueType, ValueType> comp;
   auto r5 =
       KE::is_sorted_until(exespace(), KE::cbegin(view), KE::cend(view), comp);
@@ -190,10 +188,10 @@ void run_single_scenario(const InfoType& scenario_info) {
   auto r8 = KE::is_sorted_until("label", exespace(), view, comp);
 #endif
 
-  EXPECT_TRUE(r1 == gold);
-  EXPECT_TRUE(r2 == gold);
-  EXPECT_TRUE(r3 == gold);
-  EXPECT_TRUE(r4 == gold);
+  EXPECT_EQ(r1, gold);
+  EXPECT_EQ(r2, gold);
+  EXPECT_EQ(r3, gold);
+  EXPECT_EQ(r4, gold);
 
   Kokkos::fence();
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp
index 8c0c3e4cc..8bfa51b4f 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp
@@ -43,9 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <algorithm>
 
 namespace Test {
@@ -172,7 +169,7 @@ void run_all_scenarios() {
 
 TEST(std_algorithms_lexicographical_compare_test, test) {
 // FIXME: should this disable only custom comparator tests?
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
   run_all_scenarios<DynamicTag, double>();
   run_all_scenarios<StridedTwoTag, int>();
   run_all_scenarios<StridedThreeTag, unsigned>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp
index c13cdac0b..56819de8c 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp
@@ -43,7 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_MinMaxElementOperations.hpp>
 
 namespace KE = Kokkos::Experimental;
 
@@ -228,39 +227,39 @@ template <class ViewType>
 void test_max_element_trivial_data(ViewType view) {
   /* if we pass empty range, should return last */
   auto result = KE::max_element(exespace(), KE::cbegin(view), KE::cbegin(view));
-  EXPECT_TRUE(result == KE::cbegin(view));
+  EXPECT_EQ(result, KE::cbegin(view));
 
   /* if we pass empty range, should return last */
   auto it0     = KE::cbegin(view) + 3;
   auto it1     = it0;
   auto result2 = KE::max_element(exespace(), it0, it1);
-  EXPECT_TRUE(result2 == it1);
+  EXPECT_EQ(result2, it1);
 }
 
 template <class ViewType>
 void test_min_element_trivial_data(ViewType view) {
   /* if we pass empty range, should return last */
   auto result = KE::min_element(exespace(), KE::cbegin(view), KE::cbegin(view));
-  EXPECT_TRUE(result == KE::cbegin(view));
+  EXPECT_EQ(result, KE::cbegin(view));
 
   /* if we pass empty range, should return last */
   auto it0     = KE::cbegin(view) + 3;
   auto it1     = it0;
   auto result2 = KE::min_element(exespace(), it0, it1);
-  EXPECT_TRUE(result2 == it1);
+  EXPECT_EQ(result2, it1);
 }
 
 template <class ViewType>
 void test_minmax_element_empty_range(ViewType view) {
   auto result =
       KE::minmax_element(exespace(), KE::cbegin(view), KE::cbegin(view));
-  EXPECT_TRUE(result.first == KE::cbegin(view));
-  EXPECT_TRUE(result.second == KE::cbegin(view));
+  EXPECT_EQ(result.first, KE::cbegin(view));
+  EXPECT_EQ(result.second, KE::cbegin(view));
   auto it0     = KE::cbegin(view) + 3;
   auto it1     = it0;
   auto result2 = KE::minmax_element(exespace(), it0, it1);
-  EXPECT_TRUE(result2.first == it1);
-  EXPECT_TRUE(result2.second == it1);
+  EXPECT_EQ(result2.first, it1);
+  EXPECT_EQ(result2.second, it1);
 }
 
 template <class ViewType>
@@ -337,7 +336,7 @@ void std_algorithms_min_max_element_test::test_minmax_element_non_trivial_data(
   }
 }
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
 template <class ViewType>
 void std_algorithms_min_max_element_test::
     test_max_element_non_trivial_data_custom_comp(ViewType view) {
@@ -446,7 +445,7 @@ TEST_F(std_algorithms_min_max_element_test, max_element_non_trivial_data) {
   test_max_element_non_trivial_data(m_strided_view);
 }
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
 // non-trivial data, custom comp
 TEST_F(std_algorithms_min_max_element_test,
        min_element_non_trivial_data_custom_comp) {
@@ -478,7 +477,7 @@ TEST_F(std_algorithms_min_max_element_test, minmax_element_non_trivial_data) {
 }
 #endif
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
 // OpenMPTarget does not yet support custom comparator
 TEST_F(std_algorithms_min_max_element_test,
        minmax_element_non_trivial_data_custom_comp) {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp
index f13fe071d..4bc4e018b 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp
@@ -44,8 +44,6 @@
 
 #include <TestStdAlgorithmsCommon.hpp>
 #include <iterator>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_NonModifyingSequenceOperations.hpp>
 #include <algorithm>
 #include <numeric>
 
@@ -150,10 +148,10 @@ void run_single_scenario(ViewType view1, ViewType view2,
     const auto my_diff12 = my_res1.second - f2;
     const auto my_diff21 = my_res2.first - f1;
     const auto my_diff22 = my_res2.second - f2;
-    EXPECT_TRUE(my_diff11 == std_diff1);
-    EXPECT_TRUE(my_diff12 == std_diff2);
-    EXPECT_TRUE(my_diff21 == std_diff1);
-    EXPECT_TRUE(my_diff22 == std_diff2);
+    EXPECT_EQ(my_diff11, std_diff1);
+    EXPECT_EQ(my_diff12, std_diff2);
+    EXPECT_EQ(my_diff21, std_diff1);
+    EXPECT_EQ(my_diff22, std_diff2);
   }
 
   {
@@ -164,10 +162,10 @@ void run_single_scenario(ViewType view1, ViewType view2,
     const auto my_diff12 = my_res1.second - KE::begin(view2);
     const auto my_diff21 = my_res2.first - KE::begin(view1);
     const auto my_diff22 = my_res2.second - KE::begin(view2);
-    EXPECT_TRUE(my_diff11 == std_diff1);
-    EXPECT_TRUE(my_diff12 == std_diff2);
-    EXPECT_TRUE(my_diff21 == std_diff1);
-    EXPECT_TRUE(my_diff22 == std_diff2);
+    EXPECT_EQ(my_diff11, std_diff1);
+    EXPECT_EQ(my_diff12, std_diff2);
+    EXPECT_EQ(my_diff21, std_diff1);
+    EXPECT_EQ(my_diff22, std_diff2);
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
index 44acb4773..8d4f60403 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp
@@ -43,7 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_ModifyingOperations.hpp>
 
 namespace Test {
 namespace stdalgos {
@@ -76,19 +75,19 @@ struct MyMovableType {
 
 TEST(std_algorithms_mod_ops_test, move) {
   MyMovableType a;
-  using move_t = decltype(KE::move(a));
+  using move_t = decltype(std::move(a));
   static_assert(std::is_rvalue_reference<move_t>::value, "");
 
   // move constr
-  MyMovableType b(KE::move(a));
-  EXPECT_TRUE(b.m_value == 11);
-  EXPECT_TRUE(a.m_value == -2);
+  MyMovableType b(std::move(a));
+  EXPECT_EQ(b.m_value, 11);
+  EXPECT_EQ(a.m_value, -2);
 
   // move assign
   MyMovableType c;
-  c = KE::move(b);
-  EXPECT_TRUE(c.m_value == 11);
-  EXPECT_TRUE(b.m_value == -4);
+  c = std::move(b);
+  EXPECT_EQ(c.m_value, 11);
+  EXPECT_EQ(b.m_value, -4);
 }
 
 template <class ViewType>
@@ -98,9 +97,9 @@ struct StdAlgoModSeqOpsTestMove {
   KOKKOS_INLINE_FUNCTION
   void operator()(const int index) const {
     typename ViewType::value_type a{11};
-    using move_t = decltype(KE::move(a));
+    using move_t = decltype(std::move(a));
     static_assert(std::is_rvalue_reference<move_t>::value, "");
-    m_view(index) = KE::move(a);
+    m_view(index) = std::move(a);
   }
 
   StdAlgoModSeqOpsTestMove(ViewType view) : m_view(view) {}
@@ -126,8 +125,8 @@ TEST(std_algorithms_mod_ops_test, swap) {
     int a = 1;
     int b = 2;
     KE::swap(a, b);
-    EXPECT_TRUE(a == 2);
-    EXPECT_TRUE(b == 1);
+    EXPECT_EQ(a, 2);
+    EXPECT_EQ(b, 1);
   }
 
   {
@@ -180,17 +179,17 @@ void test_iter_swap(ViewType view) {
   using value_type = typename ViewType::value_type;
   auto a_dc        = create_deep_copyable_compatible_clone(view);
   auto a_h         = create_mirror_view_and_copy(Kokkos::HostSpace(), a_dc);
-  EXPECT_TRUE(view.extent(0) == 10);
-  EXPECT_TRUE(a_h(0) == value_type(3));
-  EXPECT_TRUE(a_h(1) == value_type(1));
-  EXPECT_TRUE(a_h(2) == value_type(2));
-  EXPECT_TRUE(a_h(3) == value_type(0));
-  EXPECT_TRUE(a_h(4) == value_type(6));
-  EXPECT_TRUE(a_h(5) == value_type(5));
-  EXPECT_TRUE(a_h(6) == value_type(4));
-  EXPECT_TRUE(a_h(7) == value_type(7));
-  EXPECT_TRUE(a_h(8) == value_type(8));
-  EXPECT_TRUE(a_h(9) == value_type(9));
+  EXPECT_EQ(view.extent_int(0), 10);
+  EXPECT_EQ(a_h(0), value_type(3));
+  EXPECT_EQ(a_h(1), value_type(1));
+  EXPECT_EQ(a_h(2), value_type(2));
+  EXPECT_EQ(a_h(3), value_type(0));
+  EXPECT_EQ(a_h(4), value_type(6));
+  EXPECT_EQ(a_h(5), value_type(5));
+  EXPECT_EQ(a_h(6), value_type(4));
+  EXPECT_EQ(a_h(7), value_type(7));
+  EXPECT_EQ(a_h(8), value_type(8));
+  EXPECT_EQ(a_h(9), value_type(9));
 }
 
 TEST(std_algorithms_mod_ops_test, iter_swap_static_view) {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp
index 3a8883d48..1e3960c5e 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
-#include "std_algorithms/Kokkos_BeginEnd.hpp"
 
 namespace KE = Kokkos::Experimental;
 
@@ -390,16 +388,16 @@ void test_swap_ranges(ViewType view) {
   parallel_for(ext, cp_func_a_t(view, checkViewA));
   auto cvA_h =
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkViewA);
-  EXPECT_TRUE(cvA_h(0) == 0);
-  EXPECT_TRUE(cvA_h(1) == 1);
-  EXPECT_TRUE(cvA_h(2) == 99);
-  EXPECT_TRUE(cvA_h(3) == 98);
-  EXPECT_TRUE(cvA_h(4) == 97);
-  EXPECT_TRUE(cvA_h(5) == 96);
-  EXPECT_TRUE(cvA_h(6) == 6);
-  EXPECT_TRUE(cvA_h(7) == 7);
-  EXPECT_TRUE(cvA_h(8) == 8);
-  EXPECT_TRUE(cvA_h(9) == 9);
+  EXPECT_EQ(cvA_h(0), 0);
+  EXPECT_EQ(cvA_h(1), 1);
+  EXPECT_EQ(cvA_h(2), 99);
+  EXPECT_EQ(cvA_h(3), 98);
+  EXPECT_EQ(cvA_h(4), 97);
+  EXPECT_EQ(cvA_h(5), 96);
+  EXPECT_EQ(cvA_h(6), 6);
+  EXPECT_EQ(cvA_h(7), 7);
+  EXPECT_EQ(cvA_h(8), 8);
+  EXPECT_EQ(cvA_h(9), 9);
 
   /* check viewB */
   static_view_type checkViewB("tmpB");
@@ -407,16 +405,16 @@ void test_swap_ranges(ViewType view) {
   Kokkos::parallel_for(ext, cp_func_b_t(viewB, checkViewB));
   auto cvB_h =
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), checkViewB);
-  EXPECT_TRUE(cvB_h(0) == 100);
-  EXPECT_TRUE(cvB_h(1) == 2);
-  EXPECT_TRUE(cvB_h(2) == 3);
-  EXPECT_TRUE(cvB_h(3) == 4);
-  EXPECT_TRUE(cvB_h(4) == 5);
-  EXPECT_TRUE(cvB_h(5) == 95);
-  EXPECT_TRUE(cvB_h(6) == 94);
-  EXPECT_TRUE(cvB_h(7) == 93);
-  EXPECT_TRUE(cvB_h(8) == 92);
-  EXPECT_TRUE(cvB_h(9) == 91);
+  EXPECT_EQ(cvB_h(0), 100);
+  EXPECT_EQ(cvB_h(1), 2);
+  EXPECT_EQ(cvB_h(2), 3);
+  EXPECT_EQ(cvB_h(3), 4);
+  EXPECT_EQ(cvB_h(4), 5);
+  EXPECT_EQ(cvB_h(5), 95);
+  EXPECT_EQ(cvB_h(6), 94);
+  EXPECT_EQ(cvB_h(7), 93);
+  EXPECT_EQ(cvB_h(8), 92);
+  EXPECT_EQ(cvB_h(9), 91);
 }
 
 TEST_F(std_algorithms_mod_seq_ops_test, swap_ranges) {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp
new file mode 100644
index 000000000..002d35466
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp
@@ -0,0 +1,135 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <TestStdAlgorithmsCommon.hpp>
+#include <utility>
+#include <Kokkos_Random.hpp>
+
+namespace Test {
+namespace stdalgos {
+namespace MoveBackward {
+
+namespace KE = Kokkos::Experimental;
+
+template <class Tag, class ValueType, class InfoType>
+void run_single_scenario(const InfoType& scenario_info, int apiId) {
+  const std::size_t view_ext = std::get<1>(scenario_info);
+
+  auto v = create_view<ValueType>(Tag{}, view_ext, "v");
+
+  // v might not be deep copyable so to modify it on the host
+  // need to do all this
+  auto v_dc   = create_deep_copyable_compatible_view_with_same_extent(v);
+  auto v_dc_h = create_mirror_view(Kokkos::HostSpace(), v_dc);
+  Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(12371);
+  Kokkos::fill_random(v_dc_h, pool, 0, 523);
+  // copy to v_dc and then to v
+  Kokkos::deep_copy(v_dc, v_dc_h);
+  CopyFunctor<decltype(v_dc), decltype(v)> F1(v_dc, v);
+  Kokkos::parallel_for("copy", v.extent(0), F1);
+
+  // make a gold copy of v before calling the algorithm
+  // since the algorithm will modify v
+  auto gold = create_host_space_copy(v);
+
+  // create another view that is bigger than v
+  // because we need it to test the move_backward
+  auto v2 = create_view<ValueType>(Tag{}, view_ext + 5, "v2");
+
+  if (apiId == 0) {
+    auto rit =
+        KE::move_backward(exespace(), KE::begin(v), KE::end(v), KE::end(v2));
+    const int dist = KE::distance(KE::begin(v2), rit);
+    EXPECT_EQ(dist, 5);
+  } else if (apiId == 1) {
+    auto rit       = KE::move_backward("mylabel", exespace(), KE::begin(v),
+                                 KE::end(v), KE::end(v2));
+    const int dist = KE::distance(KE::begin(v2), rit);
+    EXPECT_EQ(dist, 5);
+  } else if (apiId == 2) {
+    auto rit       = KE::move_backward(exespace(), v, v2);
+    const int dist = KE::distance(KE::begin(v2), rit);
+    EXPECT_EQ(dist, 5);
+  } else if (apiId == 3) {
+    auto rit       = KE::move_backward("mylabel", exespace(), v, v2);
+    const int dist = KE::distance(KE::begin(v2), rit);
+    EXPECT_EQ(dist, 5);
+  }
+
+  // check
+  auto v2_h = create_host_space_copy(v2);
+  for (std::size_t j = 0; j < v2_h.extent(1); ++j) {
+    if (j < 5) {
+      EXPECT_TRUE(v2_h(j) == static_cast<ValueType>(0));
+    } else {
+      EXPECT_TRUE(gold(j - 5) == v2_h(j));
+    }
+  }
+}
+
+template <class Tag, class ValueType>
+void run_all_scenarios() {
+  const std::map<std::string, std::size_t> scenarios = {
+      {"empty", 0},          {"one-element-a", 1},  {"one-element-b", 1},
+      {"two-elements-a", 2}, {"two-elements-b", 2}, {"small-a", 9},
+      {"small-b", 13},       {"medium", 1103},      {"large", 101513}};
+
+  for (const auto& it : scenarios) {
+    run_single_scenario<Tag, ValueType>(it, 0);
+    run_single_scenario<Tag, ValueType>(it, 1);
+    run_single_scenario<Tag, ValueType>(it, 2);
+    run_single_scenario<Tag, ValueType>(it, 3);
+  }
+}
+
+TEST(std_algorithms_mod_seq_ops, move_backward) {
+  run_all_scenarios<DynamicTag, int>();
+  run_all_scenarios<DynamicTag, double>();
+  run_all_scenarios<StridedThreeTag, int>();
+  run_all_scenarios<StridedThreeTag, double>();
+}
+
+}  // namespace MoveBackward
+}  // namespace stdalgos
+}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp
index 51f74220c..0ea5fcc99 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp
@@ -43,87 +43,12 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_Numeric.hpp>
 
 namespace KE = Kokkos::Experimental;
 
 namespace Test {
 namespace stdalgos {
 
-struct CustomValueType {
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType(){};
-
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType(value_type val) : value(val){};
-
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType(const CustomValueType& other) { this->value = other.value; }
-
-  KOKKOS_INLINE_FUNCTION
-  value_type& operator()() { return value; }
-
-  KOKKOS_INLINE_FUNCTION
-  const value_type& operator()() const { return value; }
-
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType& operator+=(const CustomValueType& other) {
-    this->value += other.value;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType& operator=(const CustomValueType& other) {
-    this->value = other.value;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType operator+(const CustomValueType& other) const {
-    CustomValueType result;
-    result.value = this->value + other.value;
-    return result;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType operator*(const CustomValueType& other) const {
-    CustomValueType result;
-    result.value = this->value * other.value;
-    return result;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  bool operator==(const CustomValueType& other) const {
-    return this->value == other.value;
-  }
-
-  //
-  // volatile overloads needed for the kokkos reductions
-  //
-  // note the void return
-  KOKKOS_INLINE_FUNCTION
-  void operator+=(const volatile CustomValueType& other) volatile {
-    this->value += other.value;
-  }
-
-  // note the void return
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const CustomValueType& other) volatile {
-    this->value = other.value;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  CustomValueType operator+(const volatile CustomValueType& other) const
-      volatile {
-    CustomValueType result;
-    result.value = this->value + other.value;
-    return result;
-  }
-
- private:
-  value_type value = {};
-};
-
 template <class ValueType>
 struct TimesTwoUnaryTransformFunctor {
   KOKKOS_INLINE_FUNCTION
@@ -144,12 +69,6 @@ struct SumJoinFunctor {
   ValueType operator()(const ValueType& a, const ValueType& b) const {
     return a + b;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType& a,
-                       const volatile ValueType& b) const {
-    return a + b;
-  }
 };
 
 struct std_algorithms_numerics_test : public ::testing::Test {
@@ -239,7 +158,7 @@ struct std_algorithms_numerics_test : public ::testing::Test {
   }
 };
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
 
 // -------------------------------------------------------------------
 // test default case of transform_reduce
@@ -260,8 +179,8 @@ void run_and_check_transform_reduce_default(ViewType1 first_view,
   const auto r2 = KE::transform_reduce(
       "MYLABEL", ExecutionSpace(), KE::cbegin(first_view),
       KE::cbegin(first_view), KE::cbegin(second_view), init_value);
-  EXPECT_TRUE(r1 == init_value);
-  EXPECT_TRUE(r2 == init_value);
+  EXPECT_EQ(r1, init_value);
+  EXPECT_EQ(r2, init_value);
 
   // non-trivial cases
   const auto r3 = KE::transform_reduce(ExecutionSpace(), KE::cbegin(first_view),
@@ -277,10 +196,10 @@ void run_and_check_transform_reduce_default(ViewType1 first_view,
   const auto r6 = KE::transform_reduce("MYLABEL", ExecutionSpace(), first_view,
                                        second_view, init_value);
 
-  EXPECT_TRUE(r3 == result_value);
-  EXPECT_TRUE(r4 == result_value);
-  EXPECT_TRUE(r5 == result_value);
-  EXPECT_TRUE(r6 == result_value);
+  EXPECT_EQ(r3, result_value);
+  EXPECT_EQ(r4, result_value);
+  EXPECT_EQ(r5, result_value);
+  EXPECT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -363,8 +282,8 @@ void run_and_check_transform_reduce_overloadA(ViewType1 first_view,
                            KE::cbegin(first_view), KE::cbegin(second_view),
                            init_value, std::forward<Args>(args)...);
 
-  EXPECT_TRUE(r1 == init_value);
-  EXPECT_TRUE(r2 == init_value);
+  EXPECT_EQ(r1, init_value);
+  EXPECT_EQ(r2, init_value);
 
   // non trivial cases
   const auto r3 = KE::transform_reduce(
@@ -382,10 +301,10 @@ void run_and_check_transform_reduce_overloadA(ViewType1 first_view,
       KE::transform_reduce("MYLABEL", ExecutionSpace(), first_view, second_view,
                            init_value, std::forward<Args>(args)...);
 
-  EXPECT_TRUE(r3 == result_value);
-  EXPECT_TRUE(r4 == result_value);
-  EXPECT_TRUE(r5 == result_value);
-  EXPECT_TRUE(r6 == result_value);
+  EXPECT_EQ(r3, result_value);
+  EXPECT_EQ(r4, result_value);
+  EXPECT_EQ(r5, result_value);
+  EXPECT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -482,8 +401,8 @@ void run_and_check_transform_reduce_overloadB(ViewType view,
                                        KE::cbegin(view), KE::cbegin(view),
                                        init_value, std::forward<Args>(args)...);
 
-  EXPECT_TRUE(r1 == init_value);
-  EXPECT_TRUE(r2 == init_value);
+  EXPECT_EQ(r1, init_value);
+  EXPECT_EQ(r2, init_value);
 
   // non trivial
   const auto r3 =
@@ -499,10 +418,10 @@ void run_and_check_transform_reduce_overloadB(ViewType view,
   const auto r6 = KE::transform_reduce("MYLABEL", ExecutionSpace(), view,
                                        init_value, std::forward<Args>(args)...);
 
-  EXPECT_TRUE(r3 == result_value);
-  EXPECT_TRUE(r4 == result_value);
-  EXPECT_TRUE(r5 == result_value);
-  EXPECT_TRUE(r6 == result_value);
+  EXPECT_EQ(r3, result_value);
+  EXPECT_EQ(r4, result_value);
+  EXPECT_EQ(r5, result_value);
+  EXPECT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -556,8 +475,8 @@ void run_and_check_reduce_overloadA(ViewType view, ValueType non_trivial_result,
       KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cbegin(view));
   const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view),
                              KE::cbegin(view));
-  EXPECT_TRUE(r1 == trivial_result);
-  EXPECT_TRUE(r2 == trivial_result);
+  EXPECT_EQ(r1, trivial_result);
+  EXPECT_EQ(r2, trivial_result);
 
   // non trivial cases
   const auto r3 =
@@ -567,10 +486,10 @@ void run_and_check_reduce_overloadA(ViewType view, ValueType non_trivial_result,
   const auto r5 = KE::reduce(ExecutionSpace(), view);
   const auto r6 = KE::reduce("MYLABEL", ExecutionSpace(), view);
 
-  EXPECT_TRUE(r3 == non_trivial_result);
-  EXPECT_TRUE(r4 == non_trivial_result);
-  EXPECT_TRUE(r5 == non_trivial_result);
-  EXPECT_TRUE(r6 == non_trivial_result);
+  EXPECT_EQ(r3, non_trivial_result);
+  EXPECT_EQ(r4, non_trivial_result);
+  EXPECT_EQ(r5, non_trivial_result);
+  EXPECT_EQ(r6, non_trivial_result);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -612,8 +531,8 @@ void run_and_check_reduce_overloadB(ViewType view, ValueType result_value,
                              KE::cbegin(view), init_value);
   const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view),
                              KE::cbegin(view), init_value);
-  EXPECT_TRUE(r1 == init_value);
-  EXPECT_TRUE(r2 == init_value);
+  EXPECT_EQ(r1, init_value);
+  EXPECT_EQ(r2, init_value);
 
   // non trivial cases
   const auto r3 = KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cend(view),
@@ -623,10 +542,10 @@ void run_and_check_reduce_overloadB(ViewType view, ValueType result_value,
   const auto r5 = KE::reduce(ExecutionSpace(), view, init_value);
   const auto r6 = KE::reduce("MYLABEL", ExecutionSpace(), view, init_value);
 
-  EXPECT_TRUE(r3 == result_value);
-  EXPECT_TRUE(r4 == result_value);
-  EXPECT_TRUE(r5 == result_value);
-  EXPECT_TRUE(r6 == result_value);
+  EXPECT_EQ(r3, result_value);
+  EXPECT_EQ(r4, result_value);
+  EXPECT_EQ(r5, result_value);
+  EXPECT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
@@ -662,8 +581,8 @@ void run_and_check_reduce_overloadC(ViewType view, ValueType result_value,
                              KE::cbegin(view), init_value, joiner);
   const auto r2 = KE::reduce("MYLABEL", ExecutionSpace(), KE::cbegin(view),
                              KE::cbegin(view), init_value, joiner);
-  EXPECT_TRUE(r1 == init_value);
-  EXPECT_TRUE(r2 == init_value);
+  EXPECT_EQ(r1, init_value);
+  EXPECT_EQ(r2, init_value);
 
   // non trivial cases
   const auto r3 = KE::reduce(ExecutionSpace(), KE::cbegin(view), KE::cend(view),
@@ -674,10 +593,10 @@ void run_and_check_reduce_overloadC(ViewType view, ValueType result_value,
   const auto r6 =
       KE::reduce("MYLABEL", ExecutionSpace(), view, init_value, joiner);
 
-  EXPECT_TRUE(r3 == result_value);
-  EXPECT_TRUE(r4 == result_value);
-  EXPECT_TRUE(r5 == result_value);
-  EXPECT_TRUE(r6 == result_value);
+  EXPECT_EQ(r3, result_value);
+  EXPECT_EQ(r4, result_value);
+  EXPECT_EQ(r5, result_value);
+  EXPECT_EQ(r6, result_value);
 }
 
 TEST_F(std_algorithms_numerics_test,
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
index 3fb4f9d15..a461f2751 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_PartitioningOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -160,12 +158,12 @@ void verify_data(const std::string& name, ResultType my_result,
   const std::size_t my_diff_true = my_result.first - KE::begin(view_dest_true);
   const std::size_t my_diff_false =
       my_result.second - KE::begin(view_dest_false);
-  EXPECT_TRUE(std_diff_true == my_diff_true);
-  EXPECT_TRUE(std_diff_false == my_diff_false);
+  EXPECT_EQ(std_diff_true, my_diff_true);
+  EXPECT_EQ(std_diff_false, my_diff_false);
 
   auto view_dest_true_h = create_host_space_copy(view_dest_true);
   for (std::size_t i = 0; i < std_diff_true; ++i) {
-    EXPECT_TRUE(std_vec_true[i] == view_dest_true_h(i));
+    EXPECT_EQ(std_vec_true[i], view_dest_true_h(i));
     // std::cout << "i= " << i << " "
     // 	      << " std_true = " << std_vec_true[i] << " "
     // 	      << " mine     = " << view_dest_true_h(i) << '\n';
@@ -173,45 +171,45 @@ void verify_data(const std::string& name, ResultType my_result,
 
   auto view_dest_false_h = create_host_space_copy(view_dest_false);
   for (std::size_t i = 0; i < std_diff_false; ++i) {
-    EXPECT_TRUE(std_vec_false[i] == view_dest_false_h(i));
+    EXPECT_EQ(std_vec_false[i], view_dest_false_h(i));
     // std::cout << "i= " << i << " "
     // 	      << " std_false = " << std_vec_false[i] << " "
     // 	      << " mine     = " << view_dest_false_h(i) << '\n';
   }
 
   if (name == "empty") {
-    EXPECT_TRUE(my_diff_true == 0);
-    EXPECT_TRUE(my_diff_false == 0);
+    EXPECT_EQ(my_diff_true, 0u);
+    EXPECT_EQ(my_diff_false, 0u);
   }
 
   else if (name == "one-element-a") {
-    EXPECT_TRUE(my_diff_true == 0);
-    EXPECT_TRUE(my_diff_false == 1);
+    EXPECT_EQ(my_diff_true, 0u);
+    EXPECT_EQ(my_diff_false, 1u);
   }
 
   else if (name == "one-element-b") {
-    EXPECT_TRUE(my_diff_true == 1);
-    EXPECT_TRUE(my_diff_false == 0);
+    EXPECT_EQ(my_diff_true, 1u);
+    EXPECT_EQ(my_diff_false, 0u);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_TRUE(my_diff_true == 1);
-    EXPECT_TRUE(my_diff_false == 1);
+    EXPECT_EQ(my_diff_true, 1u);
+    EXPECT_EQ(my_diff_false, 1u);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_TRUE(my_diff_true == 1);
-    EXPECT_TRUE(my_diff_false == 1);
+    EXPECT_EQ(my_diff_true, 1u);
+    EXPECT_EQ(my_diff_false, 1u);
   }
 
   else if (name == "small-b") {
-    EXPECT_TRUE(my_diff_true == 13);
-    EXPECT_TRUE(my_diff_false == 0);
+    EXPECT_EQ(my_diff_true, 13u);
+    EXPECT_EQ(my_diff_false, 0u);
   }
 
   else if (name == "small-c") {
-    EXPECT_TRUE(my_diff_true == 0);
-    EXPECT_TRUE(my_diff_false == 15);
+    EXPECT_EQ(my_diff_true, 0u);
+    EXPECT_EQ(my_diff_false, 15u);
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp
index 990d41ead..0d4615155 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitioningOps.cpp
@@ -43,7 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_PartitioningOperations.hpp>
 
 namespace KE = Kokkos::Experimental;
 
@@ -180,15 +179,15 @@ TEST_F(std_algorithms_partitioning_test, is_partitioned_trivial) {
   IsNegativeFunctor<value_type> p;
   const auto result1 = KE::is_partitioned(exespace(), KE::cbegin(m_static_view),
                                           KE::cbegin(m_static_view), p);
-  EXPECT_EQ(true, result1);
+  EXPECT_TRUE(result1);
 
   const auto result2 = KE::is_partitioned(
       exespace(), KE::cbegin(m_dynamic_view), KE::cbegin(m_dynamic_view), p);
-  EXPECT_EQ(true, result2);
+  EXPECT_TRUE(result2);
 
   const auto result3 = KE::is_partitioned(
       exespace(), KE::cbegin(m_strided_view), KE::cbegin(m_strided_view), p);
-  EXPECT_EQ(true, result3);
+  EXPECT_TRUE(result3);
 }
 
 TEST_F(std_algorithms_partitioning_test, is_partitioned_accepting_iterators) {
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp
index 0cd931d87..8f345f044 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -147,12 +145,12 @@ void verify_data(ViewTypeData view_data_h, ViewTypeTest view_test,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - KE::begin(view_data_h);
   const std::size_t my_diff  = my_result - KE::begin(view_test);
-  EXPECT_TRUE(std_diff == my_diff);
+  EXPECT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_test_h = create_host_space_copy(view_test);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_TRUE(view_test_h(i) == view_data_h[i]);
+    EXPECT_EQ(view_test_h(i), view_data_h[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_test_h(i) << " "
     // 	      << "std: " << view_data_h(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp
index 0b2de8147..bb7d0b52b 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -165,12 +163,12 @@ void verify_data(ViewFromType view_from, ViewDestType view_dest,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - gold_dest_std.begin();
   const std::size_t my_diff  = my_result - KE::begin(view_dest);
-  EXPECT_TRUE(std_diff == my_diff);
+  EXPECT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_dest_h = create_host_space_copy(view_dest);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_TRUE(view_dest_h(i) == gold_dest_std[i]);
+    EXPECT_EQ(view_dest_h(i), gold_dest_std[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_dest_h(i) << " "
     // 	      << "std: " << gold_dest_std[i]
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp
index 0c20b6b0a..b209b88ea 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -149,12 +147,12 @@ void verify_data(ViewTypeFrom view_from, ViewTypeDest view_dest,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - gold_dest_std.begin();
   const std::size_t my_diff  = my_result - KE::begin(view_dest);
-  EXPECT_TRUE(std_diff == my_diff);
+  EXPECT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_dest_h = create_host_space_copy(view_dest);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_TRUE(view_dest_h(i) == gold_dest_std[i]);
+    EXPECT_EQ(view_dest_h(i), gold_dest_std[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_dest_h(i) << " "
     // 	      << "std: " << gold_dest_std[i]
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp
index 05c86690a..f1f232369 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -142,12 +140,12 @@ void verify_data(ViewTypeData view_data_h, ViewTypeTest view_test,
   // check that returned iterators are correct
   const std::size_t std_diff = std_result - KE::begin(view_data_h);
   const std::size_t my_diff  = my_result - KE::begin(view_test);
-  EXPECT_TRUE(std_diff == my_diff);
+  EXPECT_EQ(std_diff, my_diff);
 
   // check the actual data after algo has been applied
   auto view_test_h = create_host_space_copy(view_test);
   for (std::size_t i = 0; i < my_diff; ++i) {
-    EXPECT_TRUE(view_test_h(i) == view_data_h[i]);
+    EXPECT_EQ(view_test_h(i), view_data_h[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_test_h(i) << " "
     // 	      << "std: " << view_data_h(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp
index 14ee73376..f044d975a 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -134,30 +132,30 @@ void verify_data(const std::string& name, ViewType1 test_view,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_TRUE(view_h(0) == ValueType{1});
+    EXPECT_EQ(view_h(0), ValueType{1});
   }
 
   else if (name == "one-element-b") {
-    EXPECT_TRUE(view_h(0) == new_value);
+    EXPECT_EQ(view_h(0), new_value);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_TRUE(view_h(0) == ValueType{1});
-    EXPECT_TRUE(view_h(1) == new_value);
+    EXPECT_EQ(view_h(0), ValueType{1});
+    EXPECT_EQ(view_h(1), new_value);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_TRUE(view_h(0) == new_value);
-    EXPECT_TRUE(view_h(1) == ValueType{-1});
+    EXPECT_EQ(view_h(0), new_value);
+    EXPECT_EQ(view_h(1), ValueType{-1});
   }
 
   else if (name == "small-a") {
     for (std::size_t i = 0; i < view_h.extent(0); ++i) {
       if (i == 0 || i == 3 || i == 5 || i == 6) {
-        EXPECT_TRUE(view_h(i) == new_value);
+        EXPECT_EQ(view_h(i), new_value);
       } else {
         const auto gold = ValueType{-5} + static_cast<ValueType>(i + 1);
-        EXPECT_TRUE(view_h(i) == gold);
+        EXPECT_EQ(view_h(i), gold);
       }
     }
   }
@@ -165,9 +163,9 @@ void verify_data(const std::string& name, ViewType1 test_view,
   else if (name == "small-b") {
     for (std::size_t i = 0; i < view_h.extent(0); ++i) {
       if (i < 4) {
-        EXPECT_TRUE(view_h(i) == ValueType{-1});
+        EXPECT_EQ(view_h(i), ValueType{-1});
       } else {
-        EXPECT_TRUE(view_h(i) == new_value);
+        EXPECT_EQ(view_h(i), new_value);
       }
     }
   }
@@ -175,9 +173,9 @@ void verify_data(const std::string& name, ViewType1 test_view,
   else if (name == "medium" || name == "large") {
     for (std::size_t i = 0; i < view_h.extent(0); ++i) {
       if (i % 2 == 0) {
-        EXPECT_TRUE(view_h(i) == ValueType{-1});
+        EXPECT_EQ(view_h(i), ValueType{-1});
       } else {
-        EXPECT_TRUE(view_h(i) == new_value);
+        EXPECT_EQ(view_h(i), new_value);
       }
     }
   }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp
index 1e7f48067..682622cc1 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -142,40 +140,40 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{1});
-    EXPECT_TRUE(view_test_h(0) == view_from_h(0));
+    EXPECT_EQ(view_from_h(0), ValueType{1});
+    EXPECT_EQ(view_test_h(0), view_from_h(0));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{2});
-    EXPECT_TRUE(view_test_h(0) == new_value);
+    EXPECT_EQ(view_from_h(0), ValueType{2});
+    EXPECT_EQ(view_test_h(0), new_value);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{1});
-    EXPECT_TRUE(view_from_h(1) == ValueType{2});
+    EXPECT_EQ(view_from_h(0), ValueType{1});
+    EXPECT_EQ(view_from_h(1), ValueType{2});
 
-    EXPECT_TRUE(view_test_h(0) == view_from_h(0));
-    EXPECT_TRUE(view_test_h(1) == new_value);
+    EXPECT_EQ(view_test_h(0), view_from_h(0));
+    EXPECT_EQ(view_test_h(1), new_value);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{2});
-    EXPECT_TRUE(view_from_h(1) == ValueType{-1});
+    EXPECT_EQ(view_from_h(0), ValueType{2});
+    EXPECT_EQ(view_from_h(1), ValueType{-1});
 
-    EXPECT_TRUE(view_test_h(0) == new_value);
-    EXPECT_TRUE(view_test_h(1) == view_from_h(1));
+    EXPECT_EQ(view_test_h(0), new_value);
+    EXPECT_EQ(view_test_h(1), view_from_h(1));
   }
 
   else if (name == "small-a") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i == 0 || i == 3 || i == 5 || i == 6) {
-        EXPECT_TRUE(view_from_h(i) == ValueType{2});
-        EXPECT_TRUE(view_test_h(i) == new_value);
+        EXPECT_EQ(view_from_h(i), ValueType{2});
+        EXPECT_EQ(view_test_h(i), new_value);
       } else {
         const auto gold = ValueType{-5} + static_cast<ValueType>(i + 1);
-        EXPECT_TRUE(view_from_h(i) == gold);
-        EXPECT_TRUE(view_test_h(i) == gold);
+        EXPECT_EQ(view_from_h(i), gold);
+        EXPECT_EQ(view_test_h(i), gold);
       }
     }
   }
@@ -183,11 +181,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "small-b") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i < 4) {
-        EXPECT_TRUE(view_from_h(i) == ValueType{-1});
-        EXPECT_TRUE(view_test_h(i) == view_from_h(i));
+        EXPECT_EQ(view_from_h(i), ValueType{-1});
+        EXPECT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_TRUE(view_from_h(i) == ValueType{2});
-        EXPECT_TRUE(view_test_h(i) == new_value);
+        EXPECT_EQ(view_from_h(i), ValueType{2});
+        EXPECT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -195,11 +193,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "medium" || name == "large") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i % 2 == 0) {
-        EXPECT_TRUE(view_from_h(i) == ValueType{-1});
-        EXPECT_TRUE(view_test_h(i) == view_from_h(i));
+        EXPECT_EQ(view_from_h(i), ValueType{-1});
+        EXPECT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_TRUE(view_from_h(i) == ValueType{2});
-        EXPECT_TRUE(view_test_h(i) == new_value);
+        EXPECT_EQ(view_from_h(i), ValueType{2});
+        EXPECT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -232,7 +230,7 @@ void run_single_scenario(const InfoType& scenario_info) {
         KE::replace_copy(exespace(), KE::cbegin(view_from), KE::cend(view_from),
                          KE::begin(view_dest), old_value, new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -245,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info) {
                                 KE::cend(view_from), KE::begin(view_dest),
                                 old_value, new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -257,7 +255,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy(exespace(), view_from, view_dest, old_value,
                                 new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -269,7 +267,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy("label", exespace(), view_from, view_dest,
                                 old_value, new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp
index cb98aac08..c2ba66e92 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -142,40 +140,40 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{1});
-    EXPECT_TRUE(view_test_h(0) == view_from_h(0));
+    EXPECT_EQ(view_from_h(0), ValueType{1});
+    EXPECT_EQ(view_test_h(0), view_from_h(0));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{2});
-    EXPECT_TRUE(view_test_h(0) == new_value);
+    EXPECT_EQ(view_from_h(0), ValueType{2});
+    EXPECT_EQ(view_test_h(0), new_value);
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{1});
-    EXPECT_TRUE(view_from_h(1) == ValueType{2});
+    EXPECT_EQ(view_from_h(0), ValueType{1});
+    EXPECT_EQ(view_from_h(1), ValueType{2});
 
-    EXPECT_TRUE(view_test_h(0) == view_from_h(0));
-    EXPECT_TRUE(view_test_h(1) == new_value);
+    EXPECT_EQ(view_test_h(0), view_from_h(0));
+    EXPECT_EQ(view_test_h(1), new_value);
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_TRUE(view_from_h(0) == ValueType{2});
-    EXPECT_TRUE(view_from_h(1) == ValueType{-1});
+    EXPECT_EQ(view_from_h(0), ValueType{2});
+    EXPECT_EQ(view_from_h(1), ValueType{-1});
 
-    EXPECT_TRUE(view_test_h(0) == new_value);
-    EXPECT_TRUE(view_test_h(1) == view_from_h(1));
+    EXPECT_EQ(view_test_h(0), new_value);
+    EXPECT_EQ(view_test_h(1), view_from_h(1));
   }
 
   else if (name == "small-a") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i == 0 || i == 3 || i == 5 || i == 6) {
-        EXPECT_TRUE(view_from_h(i) == ValueType{2});
-        EXPECT_TRUE(view_test_h(i) == new_value);
+        EXPECT_EQ(view_from_h(i), ValueType{2});
+        EXPECT_EQ(view_test_h(i), new_value);
       } else {
         const auto gold = ValueType{-5} + static_cast<ValueType>(i + 1);
-        EXPECT_TRUE(view_from_h(i) == gold);
-        EXPECT_TRUE(view_test_h(i) == gold);
+        EXPECT_EQ(view_from_h(i), gold);
+        EXPECT_EQ(view_test_h(i), gold);
       }
     }
   }
@@ -183,11 +181,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "small-b") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i < 4) {
-        EXPECT_TRUE(view_from_h(i) == ValueType{-1});
-        EXPECT_TRUE(view_test_h(i) == view_from_h(i));
+        EXPECT_EQ(view_from_h(i), ValueType{-1});
+        EXPECT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_TRUE(view_from_h(i) == ValueType{2});
-        EXPECT_TRUE(view_test_h(i) == new_value);
+        EXPECT_EQ(view_from_h(i), ValueType{2});
+        EXPECT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -195,11 +193,11 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   else if (name == "medium" || name == "large") {
     for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
       if (i % 2 == 0) {
-        EXPECT_TRUE(view_from_h(i) == ValueType{-1});
-        EXPECT_TRUE(view_test_h(i) == view_from_h(i));
+        EXPECT_EQ(view_from_h(i), ValueType{-1});
+        EXPECT_EQ(view_test_h(i), view_from_h(i));
       } else {
-        EXPECT_TRUE(view_from_h(i) == ValueType{2});
-        EXPECT_TRUE(view_test_h(i) == new_value);
+        EXPECT_EQ(view_from_h(i), ValueType{2});
+        EXPECT_EQ(view_test_h(i), new_value);
       }
     }
   }
@@ -239,7 +237,7 @@ void run_single_scenario(const InfoType& scenario_info) {
                                    KE::cend(view_from), KE::begin(view_dest),
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -250,7 +248,7 @@ void run_single_scenario(const InfoType& scenario_info) {
                                    KE::cend(view_from), KE::begin(view_dest),
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -260,7 +258,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy_if(exespace(), view_from, view_dest,
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -270,7 +268,7 @@ void run_single_scenario(const InfoType& scenario_info) {
     auto rit = KE::replace_copy_if("label", exespace(), view_from, view_dest,
                                    pred_type(), new_value);
     verify_data(name, view_from, view_dest, new_value);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp
index 8c4d04889..7237e2955 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -168,7 +166,7 @@ void verify_data(ViewType1 data_view,  // contains data
       // 		<< data_view_dc(i) << " "
       // 		<< data_view_h(i) << " "
       // 		<< test_view_h(i) << std::endl;
-      EXPECT_TRUE(data_view_h(i) == test_view_h(i));
+      EXPECT_EQ(data_view_h(i), test_view_h(i));
     }
   }
 }
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp
index 77c80ed02..f8c81dc10 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -107,7 +105,7 @@ void verify_data(ViewType1 test_view, ViewType2 orig_view) {
 
   const std::size_t ext = test_view.extent(0);
   for (std::size_t i = 0; i < ext; ++i) {
-    EXPECT_TRUE(tv_h(i) == ov_h(ext - i - 1));
+    EXPECT_EQ(tv_h(i), ov_h(ext - i - 1));
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp
index 49d40115c..bbf273970 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -166,13 +164,13 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host,
   // make sure results match
   const auto my_diff  = result_it - KE::begin(view);
   const auto std_diff = std_rit - KE::begin(data_view_host);
-  EXPECT_TRUE(my_diff == std_diff);
+  EXPECT_EQ(my_diff, std_diff);
 
   // check views match
   auto view_h           = create_host_space_copy(view);
   const std::size_t ext = view_h.extent(0);
   for (std::size_t i = 0; i < ext; ++i) {
-    EXPECT_TRUE(view_h(i) == data_view_host[i]);
+    EXPECT_EQ(view_h(i), data_view_host[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_h(i) << " "
     // 	      << "std: " << data_view_host(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp
index 02867478d..2012fde00 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -169,7 +167,7 @@ void verify_data(ViewTypeFrom view_from, ViewTypeTest view_test,
                    std_gold_h.begin());
 
   for (std::size_t i = 0; i < ext; ++i) {
-    EXPECT_TRUE(view_test_h(i) == std_gold_h[i]);
+    EXPECT_EQ(view_test_h(i), std_gold_h[i]);
     // std::cout << "i= " << i << " "
     // 	      << "from: " << view_from_h(i) << " "
     // 	      << "mine: " << view_test_h(i) << " "
@@ -207,7 +205,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit  = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it,
                                KE::cend(view_from), KE::begin(view_dest));
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -217,7 +215,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit = KE::rotate_copy("label", exespace(), KE::cbegin(view_from), n_it,
                                KE::cend(view_from), KE::begin(view_dest));
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -226,7 +224,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit =
         KE::rotate_copy(exespace(), view_from, rotation_point, view_dest);
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   {
@@ -235,7 +233,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto rit = KE::rotate_copy("label", exespace(), view_from, rotation_point,
                                view_dest);
     verify_data(view_from, view_dest, rotation_point);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + view_ext));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + view_ext));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsScalarRedVsView.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsScalarRedVsView.cpp
deleted file mode 100644
index c054dfcc1..000000000
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsScalarRedVsView.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <gtest/gtest.h>
-#include <TestStdAlgorithmsHelperFunctors.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_MinMaxElementOperations.hpp>
-
-namespace KE = Kokkos::Experimental;
-
-namespace Test {
-namespace stdalgos {
-
-template <class ViewType>
-void fill_view(ViewType dest_view) {
-  using value_type = typename ViewType::value_type;
-  using exe_space  = typename ViewType::execution_space;
-  using aux_view_t = Kokkos::View<value_type*, exe_space>;
-
-  const std::size_t ext = dest_view.extent(0);
-  aux_view_t aux_view("aux_view", ext);
-  auto v_h = create_mirror_view(Kokkos::HostSpace(), aux_view);
-
-  for (std::size_t i = 0; i < ext; ++i) {
-    v_h(i) = (value_type)i;
-  }
-  v_h(ext / 2) = (value_type)-101;
-
-  Kokkos::deep_copy(aux_view, v_h);
-  CopyFunctor<aux_view_t, ViewType> F1(aux_view, dest_view);
-  Kokkos::parallel_for("copy", dest_view.extent(0), F1);
-}
-
-template <class ViewType, class IndexType, class ReducerType>
-struct MyFunctor {
-  using red_value_type = typename ReducerType::value_type;
-
-  ViewType m_view;
-  ReducerType m_reducer;
-
-  KOKKOS_FUNCTION
-  void operator()(const IndexType i, red_value_type& red_value) const {
-    m_reducer.join(red_value, red_value_type{m_view(i), i});
-  }
-
-  KOKKOS_FUNCTION
-  MyFunctor(ViewType view, ReducerType reducer)
-      : m_view(view), m_reducer(std::move(reducer)) {}
-};
-
-TEST(scalar_vs_view_red, use_scalar) {
-  using exe_space   = Kokkos::DefaultExecutionSpace;
-  using index_type  = int;
-  using scalar_type = int;
-  using view_type   = Kokkos::View<scalar_type*, exe_space>;
-
-  const auto ext = 10001;
-  view_type view("myview", ext);
-  fill_view(view);
-
-  using reducer_type    = ::Kokkos::MinLoc<scalar_type, index_type>;
-  using red_result_type = typename reducer_type::value_type;
-  using func_type       = MyFunctor<view_type, index_type, reducer_type>;
-  red_result_type result;
-  reducer_type reducer(result);
-  Kokkos::parallel_reduce("MinLocReduce",
-                          Kokkos::RangePolicy<exe_space>(exe_space(), 0, ext),
-                          func_type(view, reducer), reducer);
-  std::cout << " use_scalar = " << result.val << '\n';
-}
-
-template <class IteratorType, class ReducerType>
-struct StdMyMinFunctor {
-  using index_type     = typename IteratorType::difference_type;
-  using red_value_type = typename ReducerType::value_type;
-
-  IteratorType m_first;
-  ReducerType m_reducer;
-
-  KOKKOS_FUNCTION
-  void operator()(const index_type i, red_value_type& red_value) const {
-    m_reducer.join(red_value, red_value_type{m_first[i], i});
-  }
-
-  KOKKOS_FUNCTION
-  StdMyMinFunctor(IteratorType first, ReducerType reducer)
-      : m_first(std::move(first)), m_reducer(std::move(reducer)) {}
-};
-
-template <class ViewType, class ReducerType>
-struct StdMyMinFunctor2 {
-  using red_value_type = typename ReducerType::value_type;
-
-  ViewType m_view;
-  ReducerType m_reducer;
-
-  KOKKOS_FUNCTION
-  void operator()(const std::size_t i, red_value_type& red_value) const {
-    m_reducer.join(red_value, red_value_type{m_view(i), i});
-  }
-
-  KOKKOS_FUNCTION
-  StdMyMinFunctor2(ViewType viewIn, ReducerType reducer)
-      : m_view(viewIn), m_reducer(std::move(reducer)) {}
-};
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType my_min_1(const ExecutionSpace& ex, IteratorType first,
-                      IteratorType last) {
-  using index_type = typename IteratorType::difference_type;
-  using value_type = typename IteratorType::value_type;
-  using reducer_type =
-      Kokkos::MinFirstLoc<value_type, index_type, ExecutionSpace>;
-  using result_view_type = typename reducer_type::result_view_type;
-  using func_t           = StdMyMinFunctor<IteratorType, reducer_type>;
-
-  result_view_type result("min_or_max_elem_impl_result");
-  reducer_type reducer(result);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(
-      "label", Kokkos::RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_t(first, reducer), reducer);
-  const auto result_h =
-      ::Kokkos::create_mirror_view_and_copy(::Kokkos::HostSpace(), result);
-  return first + result_h().loc;
-}
-
-template <class ExecutionSpace, class IteratorType>
-IteratorType my_min_2(const ExecutionSpace& ex, IteratorType first,
-                      IteratorType last) {
-  using index_type   = typename IteratorType::difference_type;
-  using value_type   = typename IteratorType::value_type;
-  using reducer_type = Kokkos::MinFirstLoc<value_type, index_type>;
-  using result_type  = typename reducer_type::value_type;
-  using func_t       = StdMyMinFunctor<IteratorType, reducer_type>;
-
-  result_type result;
-  reducer_type reducer(result);
-  const auto num_elements = Kokkos::Experimental::distance(first, last);
-  ::Kokkos::parallel_reduce(
-      "label", Kokkos::RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_t(first, reducer), reducer);
-  return first + result.loc;
-}
-
-template <class ExecutionSpace, class ViewType>
-std::size_t my_min_3(const ExecutionSpace& ex, ViewType view) {
-  using index_type   = std::size_t;
-  using value_type   = typename ViewType::value_type;
-  using reducer_type = Kokkos::MinFirstLoc<value_type, index_type>;
-  using result_type  = typename reducer_type::value_type;
-  using func_t       = StdMyMinFunctor2<ViewType, reducer_type>;
-
-  result_type result;
-  reducer_type reducer(result);
-  const auto num_elements = view.extent(0);
-  ::Kokkos::parallel_reduce(
-      "label", Kokkos::RangePolicy<ExecutionSpace>(ex, 0, num_elements),
-      func_t(view, reducer), reducer);
-  return result.loc;
-}
-
-TEST(scalar_vs_view_red, my_min_it_use_result_view) {
-  using exe_space = Kokkos::DefaultExecutionSpace;
-  using view_type = Kokkos::View<int*, exe_space>;
-  view_type view("myview", 10001);
-  fill_view(view);
-
-  auto rit = my_min_1(exe_space(), KE::cbegin(view), KE::cend(view));
-  std::cout << " my_min_el = " << KE::distance(KE::cbegin(view), rit) << '\n';
-}
-
-TEST(scalar_vs_view_red, my_min_no_it_use_result_scalar) {
-  using exe_space = Kokkos::DefaultExecutionSpace;
-  using view_type = Kokkos::View<int*, exe_space>;
-  view_type view("myview", 10001);
-  fill_view(view);
-
-  auto ind = my_min_3(exe_space(), view);
-  std::cout << " my_min_el = " << ind << '\n';
-}
-
-TEST(scalar_vs_view_red, my_min_it_use_result_scalar) {
-  using exe_space = Kokkos::DefaultExecutionSpace;
-  using view_type = Kokkos::View<int*, exe_space>;
-  view_type view("myview", 10001);
-  fill_view(view);
-
-  auto rit = my_min_2(exe_space(), KE::cbegin(view), KE::cend(view));
-  std::cout << " my_min_el = " << KE::distance(KE::cbegin(view), rit) << '\n';
-}
-
-}  // namespace stdalgos
-}  // namespace Test
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp
index 20e93e964..e57385a8b 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -289,7 +287,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                             KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
@@ -298,21 +296,21 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext,
                    KE::cbegin(s_view), KE::cend(s_view), args...);
     const auto mydiff  = myrit - KE::cbegin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::search(exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit         = KE::search("label", exespace(), view, s_view, args...);
     const auto mydiff  = myrit - KE::begin(view);
     const auto stddiff = stdrit - KE::cbegin(view_h);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp
index 4d5416635..31446046a 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -233,26 +231,26 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count,
     auto myrit = KE::search_n(exespace(), KE::cbegin(view), KE::cend(view),
                               count, value, args...);
     const auto mydiff = myrit - KE::cbegin(view);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit        = KE::search_n("label", exespace(), KE::cbegin(view),
                               KE::cend(view), count, value, args...);
     const auto mydiff = myrit - KE::cbegin(view);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit = KE::search_n("label", exespace(), view, count, value, args...);
     const auto mydiff = myrit - KE::begin(view);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   {
     auto myrit        = KE::search_n(exespace(), view, count, value, args...);
     const auto mydiff = myrit - KE::begin(view);
-    EXPECT_TRUE(mydiff == stddiff);
+    EXPECT_EQ(mydiff, stddiff);
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp
index 2af0b438a..0c97f255e 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -133,12 +131,12 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host,
   // make sure results match
   const auto my_diff  = result_it - KE::begin(view);
   const auto std_diff = std_rit - KE::begin(data_view_host);
-  EXPECT_TRUE(my_diff == std_diff);
+  EXPECT_EQ(my_diff, std_diff);
 
   // check views match
   auto view_h = create_host_space_copy(view);
   for (std::size_t i = 0; i < (std::size_t)my_diff; ++i) {
-    EXPECT_TRUE(view_h(i) == data_view_host[i]);
+    EXPECT_EQ(view_h(i), data_view_host[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << view_h(i) << " "
     // 	      << "std: " << data_view_host(i)
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp
index ae85e5c6d..d8aa350f1 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -131,14 +129,14 @@ void verify_data(ResultIt result_it, ViewType view, ViewHostType data_view_host,
   // make sure results match
   const auto my_diff  = KE::end(view) - result_it;
   const auto std_diff = KE::end(data_view_host) - std_rit;
-  EXPECT_TRUE(my_diff == std_diff);
+  EXPECT_EQ(my_diff, std_diff);
 
   // check views match
   auto view_h = create_host_space_copy(view);
   auto it1    = KE::cbegin(view_h);
   auto it2    = KE::cbegin(data_view_host);
   for (std::size_t i = 0; i < (std::size_t)my_diff; ++i) {
-    EXPECT_TRUE(it1[i] == it2[i]);
+    EXPECT_EQ(it1[i], it2[i]);
     // std::cout << "i= " << i << " "
     // 	      << "mine: " << it1[i] << " "
     // 	      << "std:  " << it2[i]
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
index e77857344..e415eff06 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_Numeric.hpp>
 #include <utility>
 
 namespace Test {
@@ -195,7 +193,7 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
 
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_TRUE(gold_h(i) == test_view_h(i));
+        EXPECT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error = std::abs(gold_h(i) - test_view_h(i));
         if (error > 1e-10) {
@@ -203,7 +201,7 @@ void verify_data(ViewType1 data_view,  // contains data
                     << " " << gold_h(i) << " " << test_view_h(i) << " "
                     << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
         }
-        EXPECT_TRUE(error < 1e-10);
+        EXPECT_LT(error, 1e-10);
       }
     }
     // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) <<
@@ -223,12 +221,6 @@ struct SumBinaryFunctor {
   ValueType operator()(const ValueType& a, const ValueType& b) const {
     return (a + b);
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType& a,
-                       const volatile ValueType& b) const {
-    return (a + b);
-  }
 };
 
 std::string value_type_to_string(int) { return "int"; }
@@ -257,7 +249,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     auto r = KE::transform_exclusive_scan(
         exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), init_value, bop, uop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -266,7 +258,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     auto r = KE::transform_exclusive_scan(
         "label", exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), init_value, bop, uop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -274,7 +266,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     fill_zero(view_dest);
     auto r = KE::transform_exclusive_scan(exespace(), view_from, view_dest,
                                           init_value, bop, uop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -282,7 +274,7 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value,
     fill_zero(view_dest);
     auto r = KE::transform_exclusive_scan("label", exespace(), view_from,
                                           view_dest, init_value, bop, uop);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, init_value, bop, uop);
   }
 
@@ -306,7 +298,7 @@ void run_all_scenarios() {
   }
 }
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
 TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan) {
   run_all_scenarios<DynamicTag, double>();
   run_all_scenarios<StridedThreeTag, double>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
index a30d67379..21ce01fb1 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_Numeric.hpp>
 #include <utility>
 
 namespace Test {
@@ -207,7 +205,7 @@ void verify_data(ViewType1 data_view,  // contains data
       //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
 
       if (std::is_same<gold_view_value_type, int>::value) {
-        EXPECT_TRUE(gold_h(i) == test_view_h(i));
+        EXPECT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error = std::abs(gold_h(i) - test_view_h(i));
         if (error > 1e-10) {
@@ -215,7 +213,7 @@ void verify_data(ViewType1 data_view,  // contains data
                     << " " << gold_h(i) << " " << test_view_h(i) << " "
                     << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
         }
-        EXPECT_TRUE(error < 1e-10);
+        EXPECT_LT(error, 1e-10);
       }
     }
     // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) <<
@@ -235,12 +233,6 @@ struct SumBinaryFunctor {
   ValueType operator()(const ValueType& a, const ValueType& b) const {
     return (a + b);
   }
-
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator()(const volatile ValueType& a,
-                       const volatile ValueType& b) const {
-    return (a + b);
-  }
 };
 
 std::string value_type_to_string(int) { return "int"; }
@@ -282,7 +274,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_from),
                                           KE::cend(view_from),
                                           KE::begin(view_dest), args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -291,7 +283,7 @@ void run_single_scenario(const InfoType& scenario_info,
     auto r = KE::transform_inclusive_scan(
         "label", exespace(), KE::cbegin(view_from), KE::cend(view_from),
         KE::begin(view_dest), args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -299,7 +291,7 @@ void run_single_scenario(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r =
         KE::transform_inclusive_scan(exespace(), view_from, view_dest, args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -307,7 +299,7 @@ void run_single_scenario(const InfoType& scenario_info,
     fill_zero(view_dest);
     auto r = KE::transform_inclusive_scan("label", exespace(), view_from,
                                           view_dest, args...);
-    EXPECT_TRUE(r == KE::end(view_dest));
+    EXPECT_EQ(r, KE::end(view_dest));
     verify_data(view_from, view_dest, args...);
   }
 
@@ -333,7 +325,7 @@ void run_all_scenarios() {
   }
 }
 
-#if not defined KOKKOS_ENABLE_OPENMPTARGET
+#if !defined KOKKOS_ENABLE_OPENMPTARGET
 TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) {
   run_all_scenarios<DynamicTag, double>();
   // run_all_scenarios<StridedThreeTag, double>();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp
index 7adfc10c3..35c293adc 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformUnaryOp.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -88,7 +86,7 @@ void verify_data(ViewTypeFrom view_from, ViewTypeTest view_test) {
       create_mirror_view_and_copy(Kokkos::HostSpace(), view_from_dc);
 
   for (std::size_t i = 0; i < view_test_h.extent(0); ++i) {
-    EXPECT_TRUE(view_test_h(i) == view_from_h(i) + value_type(1));
+    EXPECT_EQ(view_test_h(i), view_from_h(i) + value_type(1));
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp
index a13ba8240..88dd4d259 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 #include <algorithm>
 
@@ -187,7 +185,7 @@ void verify_data(const std::string& name, ResultIt my_result_it,
   //
   const auto std_diff = (std::size_t)(std_r - KE::begin(data_v_h));
   const auto my_diff  = (std::size_t)(my_result_it - KE::begin(view_test));
-  EXPECT_TRUE(my_diff == std_diff);
+  EXPECT_EQ(my_diff, std_diff);
 
   //
   // check the data in the view
@@ -200,14 +198,14 @@ void verify_data(const std::string& name, ResultIt my_result_it,
     // 		<< " my  = " << view_test_h(i) << " "
     // 		<< " std = " << data_v_h(i)
     // 		<< '\n';
-    EXPECT_TRUE(view_test_h(i) == data_v_h(i));
+    EXPECT_EQ(view_test_h(i), data_v_h(i));
   }
 
   if (name == "medium-b") {
     using value_type = typename ViewType1::value_type;
-    EXPECT_TRUE(my_diff == (std::size_t)2);
-    EXPECT_TRUE(view_test_h(0) == (value_type)22);
-    EXPECT_TRUE(view_test_h(1) == (value_type)44);
+    EXPECT_EQ(my_diff, (std::size_t)2);
+    EXPECT_EQ(view_test_h(0), (value_type)22);
+    EXPECT_EQ(view_test_h(1), (value_type)44);
   }
 }
 
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp
index 64f205b28..fdede9517 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp
@@ -43,8 +43,6 @@
 */
 
 #include <TestStdAlgorithmsCommon.hpp>
-#include <std_algorithms/Kokkos_BeginEnd.hpp>
-#include <std_algorithms/Kokkos_ModifyingSequenceOperations.hpp>
 #include <utility>
 
 namespace Test {
@@ -204,51 +202,51 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
   }
 
   else if (name == "one-element-a") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(1));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(1));
   }
 
   else if (name == "one-element-b") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
   }
 
   else if (name == "two-elements-a") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(1));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(1));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(2));
   }
 
   else if (name == "two-elements-b") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(2));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(-1));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(-1));
   }
 
   else if (name == "small-a") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(1));
-    EXPECT_TRUE(view_test_h(2) == static_cast<value_type>(2));
-    EXPECT_TRUE(view_test_h(3) == static_cast<value_type>(3));
-    EXPECT_TRUE(view_test_h(4) == static_cast<value_type>(4));
-    EXPECT_TRUE(view_test_h(5) == static_cast<value_type>(5));
-    EXPECT_TRUE(view_test_h(6) == static_cast<value_type>(6));
-    EXPECT_TRUE(view_test_h(7) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(8) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(9) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(10) == static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(1));
+    EXPECT_EQ(view_test_h(2), static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(3), static_cast<value_type>(3));
+    EXPECT_EQ(view_test_h(4), static_cast<value_type>(4));
+    EXPECT_EQ(view_test_h(5), static_cast<value_type>(5));
+    EXPECT_EQ(view_test_h(6), static_cast<value_type>(6));
+    EXPECT_EQ(view_test_h(7), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(8), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(9), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(10), static_cast<value_type>(0));
   }
 
   else if (name == "small-b") {
-    EXPECT_TRUE(view_test_h(0) == static_cast<value_type>(1));
-    EXPECT_TRUE(view_test_h(1) == static_cast<value_type>(2));
-    EXPECT_TRUE(view_test_h(2) == static_cast<value_type>(3));
-    EXPECT_TRUE(view_test_h(3) == static_cast<value_type>(4));
-    EXPECT_TRUE(view_test_h(4) == static_cast<value_type>(5));
-    EXPECT_TRUE(view_test_h(5) == static_cast<value_type>(6));
-    EXPECT_TRUE(view_test_h(6) == static_cast<value_type>(8));
-    EXPECT_TRUE(view_test_h(7) == static_cast<value_type>(9));
-    EXPECT_TRUE(view_test_h(8) == static_cast<value_type>(8));
-    EXPECT_TRUE(view_test_h(9) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(10) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(11) == static_cast<value_type>(0));
-    EXPECT_TRUE(view_test_h(12) == static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(0), static_cast<value_type>(1));
+    EXPECT_EQ(view_test_h(1), static_cast<value_type>(2));
+    EXPECT_EQ(view_test_h(2), static_cast<value_type>(3));
+    EXPECT_EQ(view_test_h(3), static_cast<value_type>(4));
+    EXPECT_EQ(view_test_h(4), static_cast<value_type>(5));
+    EXPECT_EQ(view_test_h(5), static_cast<value_type>(6));
+    EXPECT_EQ(view_test_h(6), static_cast<value_type>(8));
+    EXPECT_EQ(view_test_h(7), static_cast<value_type>(9));
+    EXPECT_EQ(view_test_h(8), static_cast<value_type>(8));
+    EXPECT_EQ(view_test_h(9), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(10), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(11), static_cast<value_type>(0));
+    EXPECT_EQ(view_test_h(12), static_cast<value_type>(0));
   }
 
   else if (name == "medium" || name == "large") {
@@ -260,7 +258,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from,
     (void)std_r;
 
     for (std::size_t i = 0; i < view_from_h.extent(0); ++i) {
-      EXPECT_TRUE(view_test_h(i) == tmp[i]);
+      EXPECT_EQ(view_test_h(i), tmp[i]);
     }
   }
 
@@ -303,7 +301,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
         KE::unique_copy(exespace(), KE::cbegin(view_from), KE::cend(view_from),
                         KE::begin(view_dest), args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -313,7 +311,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
         KE::unique_copy("label", exespace(), KE::cbegin(view_from),
                         KE::cend(view_from), KE::begin(view_dest), args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -321,7 +319,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
         create_view<ValueType>(Tag{}, view_ext, "unique_copy_dest");
     auto rit = KE::unique_copy(exespace(), view_from, view_dest, args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   {
@@ -330,7 +328,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) {
     auto rit =
         KE::unique_copy("label", exespace(), view_from, view_dest, args...);
     verify_data(name, view_from, view_dest, args...);
-    EXPECT_TRUE(rit == (KE::begin(view_dest) + n));
+    EXPECT_EQ(rit, (KE::begin(view_dest) + n));
   }
 
   Kokkos::fence();
diff --git a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp
index 3ed2ecd83..a88860749 100644
--- a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp
+++ b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp
@@ -115,9 +115,9 @@ auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) {
             << "\n";
 
   using view_value_type = typename ViewType::value_type;
-  using reducer_type    = typename std::conditional<
+  using reducer_type    = std::conditional_t<
       (flag == 0), Kokkos::MaxFirstLoc<view_value_type, IndexType, ExeSpace>,
-      Kokkos::MinFirstLoc<view_value_type, IndexType, ExeSpace> >::type;
+      Kokkos::MinFirstLoc<view_value_type, IndexType, ExeSpace> >;
   using reduction_value_type = typename reducer_type::value_type;
 
   reduction_value_type red_result;
diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/appveyor.yml
index 73a0d3187..e63fec718 100644
--- a/packages/kokkos/appveyor.yml
+++ b/packages/kokkos/appveyor.yml
@@ -5,6 +5,6 @@ build_script:
 - cmd: >-
     mkdir build &&
     cd build &&
-    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF &&
+    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ARCH_NATIVE=ON &&
     cmake --build . --target install &&
     ctest -C Debug --output-on-failure
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
index 4fc6ca2c6..be190e868 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
@@ -47,20 +47,21 @@
 
 template <class Scalar, int Unroll, int Stride>
 struct Run {
-  static void run(int N, int K, int R, int F, int T, int S);
+  static void run(int N, int K, int R, int F, int T, int S, int B, int I);
 };
 
 template <class Scalar, int Stride>
 struct RunStride {
-  static void run_1(int N, int K, int R, int F, int T, int S);
-  static void run_2(int N, int K, int R, int F, int T, int S);
-  static void run_3(int N, int K, int R, int F, int T, int S);
-  static void run_4(int N, int K, int R, int F, int T, int S);
-  static void run_5(int N, int K, int R, int F, int T, int S);
-  static void run_6(int N, int K, int R, int F, int T, int S);
-  static void run_7(int N, int K, int R, int F, int T, int S);
-  static void run_8(int N, int K, int R, int F, int T, int S);
-  static void run(int N, int K, int R, int U, int F, int T, int S);
+  static void run_1(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run_2(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run_3(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run_4(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run_5(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run_6(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run_7(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run_8(int N, int K, int R, int F, int T, int S, int B, int I);
+  static void run(int N, int K, int R, int U, int F, int T, int S, int B,
+                  int I);
 };
 
 #define STRIDE 1
@@ -83,11 +84,12 @@ struct RunStride {
 #undef STRIDE
 
 template <class Scalar>
-void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S) {
-  if (D == 1) RunStride<Scalar, 1>::run(N, K, R, U, F, T, S);
-  if (D == 2) RunStride<Scalar, 2>::run(N, K, R, U, F, T, S);
-  if (D == 4) RunStride<Scalar, 4>::run(N, K, R, U, F, T, S);
-  if (D == 8) RunStride<Scalar, 8>::run(N, K, R, U, F, T, S);
-  if (D == 16) RunStride<Scalar, 16>::run(N, K, R, U, F, T, S);
-  if (D == 32) RunStride<Scalar, 32>::run(N, K, R, U, F, T, S);
+void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
+                       int B, int I) {
+  if (D == 1) RunStride<Scalar, 1>::run(N, K, R, U, F, T, S, B, I);
+  if (D == 2) RunStride<Scalar, 2>::run(N, K, R, U, F, T, S, B, I);
+  if (D == 4) RunStride<Scalar, 4>::run(N, K, R, U, F, T, S, B, I);
+  if (D == 8) RunStride<Scalar, 8>::run(N, K, R, U, F, T, S, B, I);
+  if (D == 16) RunStride<Scalar, 16>::run(N, K, R, U, F, T, S, B, I);
+  if (D == 32) RunStride<Scalar, 32>::run(N, K, R, U, F, T, S, B, I);
 }
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp
index fb8523777..73ad21b05 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp
@@ -45,4 +45,4 @@
 #include <bench.hpp>
 
 template void run_stride_unroll<double>(int N, int K, int R, int D, int U,
-                                        int F, int T, int S);
+                                        int F, int T, int S, int B, int I);
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp
index 1df7a78c2..3964df3fa 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp
@@ -45,4 +45,4 @@
 #include <bench.hpp>
 
 template void run_stride_unroll<float>(int N, int K, int R, int D, int U, int F,
-                                       int T, int S);
+                                       int T, int S, int B, int I);
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp
index 35fe4db37..d63c30807 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp
@@ -45,4 +45,4 @@
 #include <bench.hpp>
 
 template void run_stride_unroll<int32_t>(int N, int K, int R, int D, int U,
-                                         int F, int T, int S);
+                                         int F, int T, int S, int B, int I);
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp
index 3ce15027b..51a31b16f 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp
@@ -45,4 +45,4 @@
 #include <bench.hpp>
 
 template void run_stride_unroll<int64_t>(int N, int K, int R, int D, int U,
-                                         int F, int T, int S);
+                                         int F, int T, int S, int B, int I);
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
index 64817fe9d..c29f2a18c 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
@@ -69,55 +69,56 @@
 
 template <class Scalar>
 struct RunStride<Scalar, STRIDE> {
-  static void run_1(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 1, STRIDE>::run(N, K, R, F, T, S);
+  static void run_1(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 1, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
-  static void run_2(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 2, STRIDE>::run(N, K, R, F, T, S);
+  static void run_2(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 2, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
-  static void run_3(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 3, STRIDE>::run(N, K, R, F, T, S);
+  static void run_3(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 3, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
-  static void run_4(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 4, STRIDE>::run(N, K, R, F, T, S);
+  static void run_4(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 4, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
-  static void run_5(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 5, STRIDE>::run(N, K, R, F, T, S);
+  static void run_5(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 5, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
-  static void run_6(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 6, STRIDE>::run(N, K, R, F, T, S);
+  static void run_6(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 6, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
-  static void run_7(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 7, STRIDE>::run(N, K, R, F, T, S);
+  static void run_7(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 7, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
-  static void run_8(int N, int K, int R, int F, int T, int S) {
-    Run<Scalar, 8, STRIDE>::run(N, K, R, F, T, S);
+  static void run_8(int N, int K, int R, int F, int T, int S, int B, int I) {
+    Run<Scalar, 8, STRIDE>::run(N, K, R, F, T, S, B, I);
   }
 
-  static void run(int N, int K, int R, int U, int F, int T, int S) {
+  static void run(int N, int K, int R, int U, int F, int T, int S, int B,
+                  int I) {
     if (U == 1) {
-      run_1(N, K, R, F, T, S);
+      run_1(N, K, R, F, T, S, B, I);
     }
     if (U == 2) {
-      run_2(N, K, R, F, T, S);
+      run_2(N, K, R, F, T, S, B, I);
     }
     if (U == 3) {
-      run_3(N, K, R, F, T, S);
+      run_3(N, K, R, F, T, S, B, I);
     }
     if (U == 4) {
-      run_4(N, K, R, F, T, S);
+      run_4(N, K, R, F, T, S, B, I);
     }
     if (U == 5) {
-      run_5(N, K, R, F, T, S);
+      run_5(N, K, R, F, T, S, B, I);
     }
     if (U == 6) {
-      run_6(N, K, R, F, T, S);
+      run_6(N, K, R, F, T, S, B, I);
     }
     if (U == 7) {
-      run_7(N, K, R, F, T, S);
+      run_7(N, K, R, F, T, S, B, I);
     }
     if (U == 8) {
-      run_8(N, K, R, F, T, S);
+      run_8(N, K, R, F, T, S, B, I);
     }
   }
 };
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
index 00ce635a4..58bf17b0b 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
@@ -44,7 +44,7 @@
 
 template <class Scalar>
 struct Run<Scalar, UNROLL, STRIDE> {
-  static void run(int N, int K, int R, int F, int T, int S) {
+  static void run(int N, int K, int R, int F, int T, int S, int Ba, int I) {
     Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> A("A", N, K);
     Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> B("B", N, K);
     Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> C("C", N, K);
@@ -54,98 +54,102 @@ struct Run<Scalar, UNROLL, STRIDE> {
     Kokkos::deep_copy(C, Scalar(3.5));
 
     Kokkos::Timer timer;
-    Kokkos::parallel_for(
-        "BenchmarkKernel",
-        Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)),
-        KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) {
-          const int n = team.league_rank();
-          for (int r = 0; r < R; r++) {
-            Kokkos::parallel_for(
-                Kokkos::TeamThreadRange(team, 0, K), [&](const int& i) {
-                  Scalar a1      = A(n, i, 0);
-                  const Scalar b = B(n, i, 0);
+    for (int i = 0; i < I; ++i) {
+      Kokkos::parallel_for(
+          "BenchmarkKernel",
+          Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)),
+          KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) {
+            const int n = team.league_rank();
+            for (int r = 0; r < R; r++) {
+              Kokkos::parallel_for(
+                  Kokkos::TeamThreadRange(team, 0, K), [&](const int& i) {
+                    Scalar a1      = A(n, i, 0);
+                    const Scalar b = B(n, i, 0);
 #if (UNROLL > 1)
-                  Scalar a2 = a1 * 1.3;
+                    Scalar a2 = a1 * 1.3;
 #endif
 #if (UNROLL > 2)
-                  Scalar a3 = a2 * 1.1;
+                    Scalar a3 = a2 * 1.1;
 #endif
 #if (UNROLL > 3)
-                  Scalar a4 = a3 * 1.1;
+                    Scalar a4 = a3 * 1.1;
 #endif
 #if (UNROLL > 4)
-                  Scalar a5 = a4 * 1.3;
+                    Scalar a5 = a4 * 1.3;
 #endif
 #if (UNROLL > 5)
-                  Scalar a6 = a5 * 1.1;
+                    Scalar a6 = a5 * 1.1;
 #endif
 #if (UNROLL > 6)
-                  Scalar a7 = a6 * 1.1;
+                    Scalar a7 = a6 * 1.1;
 #endif
 #if (UNROLL > 7)
-                  Scalar a8 = a7 * 1.1;
+                    Scalar a8 = a7 * 1.1;
 #endif
 
-                  for (int f = 0; f < F; f++) {
-                    a1 += b * a1;
+                    for (int f = 0; f < F; f++) {
+                      a1 += b * a1;
 #if (UNROLL > 1)
-                    a2 += b * a2;
+                      a2 += b * a2;
 #endif
 #if (UNROLL > 2)
-                    a3 += b * a3;
+                      a3 += b * a3;
 #endif
 #if (UNROLL > 3)
-                    a4 += b * a4;
+                      a4 += b * a4;
 #endif
 #if (UNROLL > 4)
-                    a5 += b * a5;
+                      a5 += b * a5;
 #endif
 #if (UNROLL > 5)
-                    a6 += b * a6;
+                      a6 += b * a6;
 #endif
 #if (UNROLL > 6)
-                    a7 += b * a7;
+                      a7 += b * a7;
 #endif
 #if (UNROLL > 7)
-                    a8 += b * a8;
+                      a8 += b * a8;
 #endif
-                  }
+                    }
 #if (UNROLL == 1)
-                  C(n, i, 0) = a1;
+                    C(n, i, 0) = a1;
 #endif
 #if (UNROLL == 2)
-                  C(n, i, 0) = a1 + a2;
+                    C(n, i, 0) = a1 + a2;
 #endif
 #if (UNROLL == 3)
-                  C(n, i, 0) = a1 + a2 + a3;
+                    C(n, i, 0) = a1 + a2 + a3;
 #endif
 #if (UNROLL == 4)
-                  C(n, i, 0) = a1 + a2 + a3 + a4;
+                    C(n, i, 0) = a1 + a2 + a3 + a4;
 #endif
 #if (UNROLL == 5)
-                  C(n, i, 0) = a1 + a2 + a3 + a4 + a5;
+                    C(n, i, 0) = a1 + a2 + a3 + a4 + a5;
 #endif
 #if (UNROLL == 6)
-                  C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6;
+                    C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6;
 #endif
 #if (UNROLL == 7)
-                  C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7;
+                    C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7;
 #endif
 #if (UNROLL == 8)
-                  C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
+                    C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
 #endif
-                });
-          }
-        });
+                  });
+            }
+          });
+    }
     Kokkos::fence();
-    double seconds = timer.seconds();
+    double seconds = timer.seconds() / static_cast<double>(I);
 
     double bytes = 1.0 * N * K * R * 3 * sizeof(Scalar);
+    bytes /= ((Ba == 2) ? (1024 * 1024 * 1024) : (1000 * 1000 * 1000));
     double flops = 1.0 * N * K * R * (F * 2 * UNROLL + 2 * (UNROLL - 1));
     printf(
-        "NKRUFTS: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: "
+        "NKRUFTSBI: %i %i %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lf%s "
+        "GFlop/s: "
         "%lf\n",
-        N, K, R, UNROLL, F, T, S, seconds,
-        1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds);
+        N, K, R, UNROLL, F, T, S, Ba, I, seconds, 1.0 * bytes / seconds,
+        Ba == 2 ? "GiB/s" : "GB/s", 1.e-9 * flops / seconds);
   }
 };
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
index f05c7d9f5..b26b8ef5e 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
@@ -48,19 +48,19 @@
 #include <cstdlib>
 
 extern template void run_stride_unroll<float>(int, int, int, int, int, int, int,
-                                              int);
+                                              int, int, int);
 extern template void run_stride_unroll<double>(int, int, int, int, int, int,
-                                               int, int);
+                                               int, int, int, int);
 extern template void run_stride_unroll<int32_t>(int, int, int, int, int, int,
-                                                int, int);
+                                                int, int, int, int);
 extern template void run_stride_unroll<int64_t>(int, int, int, int, int, int,
-                                                int, int);
+                                                int, int, int, int);
 
 int main(int argc, char* argv[]) {
   Kokkos::initialize();
 
   if (argc < 10) {
-    printf("Arguments: N K R D U F T S\n");
+    printf("Arguments: N K R D U F T S B I\n");
     printf("  P:   Precision (1==float, 2==double, 3==int32_t, 4==int64_t)\n");
     printf("  N,K: dimensions of the 2D array to allocate\n");
     printf("  R:   how often to loop through the K dimension with each team\n");
@@ -72,6 +72,10 @@ int main(int argc, char* argv[]) {
     printf("  T:   team size\n");
     printf(
         "  S:   shared memory per team (used to control occupancy on GPUs)\n");
+    printf(
+        "  B:   units for reported memory bandwidths (2=GiB, 10=GB, "
+        "default=2)\n");
+    printf("  I:   iterations of the kernel to time over (default=10)\n");
     printf("Example Input GPU:\n");
     printf("  Bandwidth Bound : 2 100000 1024 1 1 1 1 256 6000\n");
     printf("  Cache Bound     : 2 100000 1024 64 1 1 1 512 20000\n");
@@ -92,6 +96,16 @@ int main(int argc, char* argv[]) {
   int T = std::stoi(argv[8]);
   int S = std::stoi(argv[9]);
 
+  int B = 2;
+  if (argc >= 11) {
+    B = std::atoi(argv[10]);
+  }
+
+  int I = 10;
+  if (argc >= 12) {
+    I = std::atoi(argv[11]);
+  }
+
   if (U > 8) {
     printf("U must be 1-8\n");
     return 0;
@@ -105,17 +119,27 @@ int main(int argc, char* argv[]) {
     return 0;
   }
 
+  if ((B != 2) && (B != 10)) {
+    printf("B must be one of 2,10\n");
+    return 0;
+  }
+
+  if (I < 1) {
+    printf("I must be >= 1\n");
+    return 0;
+  }
+
   if (P == 1) {
-    run_stride_unroll<float>(N, K, R, D, U, F, T, S);
+    run_stride_unroll<float>(N, K, R, D, U, F, T, S, B, I);
   }
   if (P == 2) {
-    run_stride_unroll<double>(N, K, R, D, U, F, T, S);
+    run_stride_unroll<double>(N, K, R, D, U, F, T, S, B, I);
   }
   if (P == 3) {
-    run_stride_unroll<int32_t>(N, K, R, D, U, F, T, S);
+    run_stride_unroll<int32_t>(N, K, R, D, U, F, T, S, B, I);
   }
   if (P == 4) {
-    run_stride_unroll<int64_t>(N, K, R, D, U, F, T, S);
+    run_stride_unroll<int64_t>(N, K, R, D, U, F, T, S, B, I);
   }
 
   Kokkos::finalize();
diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper
index 8c168412e..e1a208813 100755
--- a/packages/kokkos/bin/nvcc_wrapper
+++ b/packages/kokkos/bin/nvcc_wrapper
@@ -227,7 +227,7 @@ do
     fi
     ;;
   #Handle known nvcc args
-  --dryrun|--verbose|--keep|--keep-dir*|-G|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
+  --dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
     cuda_args="$cuda_args $1"
     ;;
   #Handle more known nvcc args
@@ -242,6 +242,77 @@ do
     cuda_args="$cuda_args $1 $2"
     shift
     ;;
+  # Handle Werror. Note, we must differentiate between the ones going to nvcc and the host compiler
+  # --Werror kind,... OR --Werror=kind,... <- always to nvcc
+  --Werror)
+    cuda_args="$cuda_args $1 $2"
+    shift
+    ;;
+  --Werror=*)
+    cuda_args="$cuda_args $1"
+    ;;
+  # -Werror kind,... where kind is one of {all-warnings, cross-execution-space-call, reorder, default-stream-launch, missing-launch-bounds, ext-lambda-captures-this, deprecated-declarations} <- goes to nvcc
+  # -Werror not followed by any kind as mentioned above goes to host compiler without any arguments
+  -Werror)
+    if [ $# -gt 1 ]; then
+      IFS="," read -r -a kinds <<< "$2"
+      first_kind=${kinds[0]}
+      # check if the first kind is one of the allowed ones, then this must be an nvcc list so put all of them to the cuda compiler
+      case $first_kind in
+      all-warnings|cross-execution-space-call|reorder|default-stream-launch|missing-launch-bounds|ext-lambda-captures-this|deprecated-declarations)
+        cuda_args="$cuda_args $1 $2"
+        shift
+        ;;
+      *)
+        if [ $first_xcompiler_arg -eq 1 ]; then
+          xcompiler_args="$1"
+          first_xcompiler_arg=0
+        else
+          xcompiler_args="$xcompiler_args,$1"
+        fi
+        ;;
+      esac
+    fi
+    ;;
+  # -Werror=kind,... will be split into two parts, those kinds that belong to nvcc (see above) go there, while all others go towards the host compiler
+  -Werror=*)
+    kinds_str="${1:8}" # strip -Werror=
+    IFS="," read -r -a kinds <<< ${kinds_str}
+    first_werror_cuda=1
+    first_werror_host=1
+    xcompiler_args_werror=
+    # loop over all kinds that are sparated via ','
+    for kind in "${kinds[@]}"
+    do
+      case ${kind} in
+      all-warnings|cross-execution-space-call|reorder|default-stream-launch|missing-launch-bounds|ext-lambda-captures-this|deprecated-declarations)
+        if [ $first_werror_cuda -ne 0 ]; then
+          cuda_args="$cuda_args -Werror="
+          first_werror_cuda=0
+        else
+          cuda_args="$cuda_args,"
+        fi
+        cuda_args="$cuda_args$kind"
+        ;;
+      *)
+        if [ $first_werror_host -eq 0 ]; then
+            xcompiler_args_werror="${xcompiler_args_werror},"
+        fi
+        first_werror_host=0
+        xcompiler_args_werror="$xcompiler_args_werror-Werror=$kind"
+        ;;
+      esac
+    done
+    if [ $first_werror_host -eq 0 ]; then
+      if [ $first_xcompiler_arg -eq 1 ]; then
+        xcompiler_args="$xcompiler_args_werror"
+        first_xcompiler_arg=0
+      else
+        xcompiler_args="$xcompiler_args,$xcompiler_args_werror"
+      fi
+    fi
+    ;;
+  # End of Werror handling
   #Handle unsupported standard flags
   --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a)
     fallback_std_flag="-std=c++14"
@@ -310,7 +381,7 @@ do
   -std=c++98|--std=c++98)
     ;;
   #strip of pedantic because it produces endless warnings about #LINE added by the preprocessor
-  -pedantic|-Wpedantic|-ansi)
+  -pedantic|-pedantic-errors|-Wpedantic|-ansi)
     ;;
   #strip of -Woverloaded-virtual to avoid "cc1: warning: command line option ‘-Woverloaded-virtual’ is valid for C++/ObjC++ but not for C"
   -Woverloaded-virtual)
diff --git a/packages/kokkos/cmake/Dependencies.cmake b/packages/kokkos/cmake/Dependencies.cmake
index c0be9f564..10df9fe45 100644
--- a/packages/kokkos/cmake/Dependencies.cmake
+++ b/packages/kokkos/cmake/Dependencies.cmake
@@ -1,3 +1,14 @@
+IF (CMAKE_CXX_STANDARD GREATER_EQUAL 17)
+  SET(KOKKOS_SIMD_TEST_CLASS PT)
+ELSE()
+  SET(KOKKOS_SIMD_TEST_CLASS EX)
+  IF (${PROJECT_NAME}_ENABLE_KokkosSimd)
+    MESSAGE(WARNING "KokkosSimd is explicitly enabled but C++17 is not available")
+  ELSE()
+    MESSAGE(STATUS "Disabling KokkosSimd by default because C++17 is not available")
+  ENDIF()
+ENDIF()
+
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
     #SubPackageName       Directory         Class    Req/Opt
@@ -6,4 +17,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
     Core                  core              PS       REQUIRED
     Containers            containers        PS       OPTIONAL
     Algorithms            algorithms        PS       OPTIONAL
+    Simd                  simd              ${KOKKOS_SIMD_TEST_CLASS}       OPTIONAL
   )
diff --git a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
index 6788e77ad..23bc86cc8 100644
--- a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
+++ b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in
@@ -4,6 +4,7 @@ SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@)
 SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@)
 SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@")
 SET(Kokkos_CXX_COMPILER_ID "@KOKKOS_CXX_COMPILER_ID@")
+SET(Kokkos_CXX_STANDARD @KOKKOS_CXX_STANDARD@)
 
 # These are needed by KokkosKernels
 FOREACH(DEV ${Kokkos_DEVICES})
diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in
index f3fd9f6d7..34807ac2b 100644
--- a/packages/kokkos/cmake/KokkosCore_config.h.in
+++ b/packages/kokkos/cmake/KokkosCore_config.h.in
@@ -14,6 +14,7 @@
 /* Execution Spaces */
 #cmakedefine KOKKOS_ENABLE_SERIAL
 #cmakedefine KOKKOS_ENABLE_OPENMP
+#cmakedefine KOKKOS_ENABLE_OPENACC
 #cmakedefine KOKKOS_ENABLE_OPENMPTARGET
 #cmakedefine KOKKOS_ENABLE_THREADS
 #cmakedefine KOKKOS_ENABLE_CUDA
@@ -23,14 +24,6 @@
 #cmakedefine KOKKOS_ENABLE_LIBRT
 #cmakedefine KOKKOS_ENABLE_SYCL
 
-#ifndef __CUDA_ARCH__
-#cmakedefine KOKKOS_ENABLE_TM
-#cmakedefine KOKKOS_USE_ISA_X86_64
-#cmakedefine KOKKOS_USE_ISA_KNC
-#cmakedefine KOKKOS_USE_ISA_POWERPCLE
-#cmakedefine KOKKOS_USE_ISA_POWERPCBE
-#endif
-
 /* General Settings */
 #cmakedefine KOKKOS_ENABLE_CXX14
 #cmakedefine KOKKOS_ENABLE_CXX17
@@ -48,13 +41,11 @@
 #cmakedefine KOKKOS_ENABLE_DEBUG
 #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-#cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS
 #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
 #cmakedefine KOKKOS_ENABLE_TUNING
 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3
 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS
 #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
-#cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
 #cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION  // deprecated
diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake
index 5fc6a6930..8adcdcdbb 100644
--- a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake
@@ -1 +1 @@
-KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h  LIBRARY dl)
+KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS})
diff --git a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake
index 2d140c85c..0e3c9f8dd 100644
--- a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake
@@ -1,5 +1,5 @@
 INCLUDE(FindPackageHandleStandardArgs)
-INCLUDE("${CMAKE_SOURCE_DIR}/cmake/tpls/FindTPLPthread.cmake")
+INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/cmake/tpls/FindTPLPthread.cmake")
 
 IF (TARGET Threads::Threads)
   SET(FOUND_THREADS TRUE)
diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake
index fbd6745a6..f39457205 100644
--- a/packages/kokkos/cmake/fake_tribits.cmake
+++ b/packages/kokkos/cmake/fake_tribits.cmake
@@ -3,8 +3,6 @@
 INCLUDE(CMakeParseArguments)
 INCLUDE(CTest)
 
-cmake_policy(SET CMP0054 NEW)
-
 FUNCTION(ASSERT_DEFINED VARS)
   FOREACH(VAR ${VARS})
     IF(NOT DEFINED ${VAR})
diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake
index a8b5b6847..d4c2cda65 100644
--- a/packages/kokkos/cmake/kokkos_arch.cmake
+++ b/packages/kokkos/cmake/kokkos_arch.cmake
@@ -1,7 +1,7 @@
 
-FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION)
+FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY)
   #all optimizations off by default
-  KOKKOS_OPTION(ARCH_${SUFFIX} OFF BOOL "Optimize for ${DESCRIPTION} (${DEV_TYPE})")
+  KOKKOS_DEPENDENT_OPTION(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF)
   SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE)
   SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE)
   SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE)
@@ -30,51 +30,83 @@ SET(KOKKOS_ARCH_LIST)
 
 
 KOKKOS_DEPRECATED_LIST(ARCH ARCH)
-KOKKOS_ARCH_OPTION(AMDAVX          HOST "AMD chip")
-KOKKOS_ARCH_OPTION(ARMV80          HOST "ARMv8.0 Compatible CPU")
-KOKKOS_ARCH_OPTION(ARMV81          HOST "ARMv8.1 Compatible CPU")
-KOKKOS_ARCH_OPTION(ARMV8_THUNDERX  HOST "ARMv8 Cavium ThunderX CPU")
-KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU")
-KOKKOS_ARCH_OPTION(A64FX           HOST "ARMv8.2 with SVE Support")
-KOKKOS_ARCH_OPTION(WSM             HOST "Intel Westmere CPU")
-KOKKOS_ARCH_OPTION(SNB             HOST "Intel Sandy/Ivy Bridge CPUs")
-KOKKOS_ARCH_OPTION(HSW             HOST "Intel Haswell CPUs")
-KOKKOS_ARCH_OPTION(BDW             HOST "Intel Broadwell Xeon E-class CPUs")
-KOKKOS_ARCH_OPTION(SKX             HOST "Intel Sky Lake Xeon E-class HPC CPUs (AVX512)")
-KOKKOS_ARCH_OPTION(KNC             HOST "Intel Knights Corner Xeon Phi")
-KOKKOS_ARCH_OPTION(KNL             HOST "Intel Knights Landing Xeon Phi")
-KOKKOS_ARCH_OPTION(BGQ             HOST "IBM Blue Gene Q")
-KOKKOS_ARCH_OPTION(POWER7          HOST "IBM POWER7 CPUs")
-KOKKOS_ARCH_OPTION(POWER8          HOST "IBM POWER8 CPUs")
-KOKKOS_ARCH_OPTION(POWER9          HOST "IBM POWER9 CPUs")
-KOKKOS_ARCH_OPTION(KEPLER30        GPU  "NVIDIA Kepler generation CC 3.0")
-KOKKOS_ARCH_OPTION(KEPLER32        GPU  "NVIDIA Kepler generation CC 3.2")
-KOKKOS_ARCH_OPTION(KEPLER35        GPU  "NVIDIA Kepler generation CC 3.5")
-KOKKOS_ARCH_OPTION(KEPLER37        GPU  "NVIDIA Kepler generation CC 3.7")
-KOKKOS_ARCH_OPTION(MAXWELL50       GPU  "NVIDIA Maxwell generation CC 5.0")
-KOKKOS_ARCH_OPTION(MAXWELL52       GPU  "NVIDIA Maxwell generation CC 5.2")
-KOKKOS_ARCH_OPTION(MAXWELL53       GPU  "NVIDIA Maxwell generation CC 5.3")
-KOKKOS_ARCH_OPTION(PASCAL60        GPU  "NVIDIA Pascal generation CC 6.0")
-KOKKOS_ARCH_OPTION(PASCAL61        GPU  "NVIDIA Pascal generation CC 6.1")
-KOKKOS_ARCH_OPTION(VOLTA70         GPU  "NVIDIA Volta generation CC 7.0")
-KOKKOS_ARCH_OPTION(VOLTA72         GPU  "NVIDIA Volta generation CC 7.2")
-KOKKOS_ARCH_OPTION(TURING75        GPU  "NVIDIA Turing generation CC 7.5")
-KOKKOS_ARCH_OPTION(AMPERE80        GPU  "NVIDIA Ampere generation CC 8.0")
-KOKKOS_ARCH_OPTION(AMPERE86        GPU  "NVIDIA Ampere generation CC 8.6")
-KOKKOS_ARCH_OPTION(ZEN             HOST "AMD Zen architecture")
-KOKKOS_ARCH_OPTION(ZEN2            HOST "AMD Zen2 architecture")
-KOKKOS_ARCH_OPTION(ZEN3            HOST "AMD Zen3 architecture")
-KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900")
-KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906")
-KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU MI100 GFX908")
-KOKKOS_ARCH_OPTION(VEGA90A         GPU  "AMD GPU MI200 GFX90A")
-KOKKOS_ARCH_OPTION(INTEL_GEN       GPU  "Intel GPUs Gen9+")
-KOKKOS_ARCH_OPTION(INTEL_DG1       GPU  "Intel Iris XeMAX GPU")
-KOKKOS_ARCH_OPTION(INTEL_GEN9      GPU  "Intel GPU Gen9")
-KOKKOS_ARCH_OPTION(INTEL_GEN11     GPU  "Intel GPU Gen11")
-KOKKOS_ARCH_OPTION(INTEL_GEN12LP   GPU  "Intel GPU Gen12LP")
-KOKKOS_ARCH_OPTION(INTEL_XEHP      GPU  "Intel GPU Xe-HP")
 
+SET(HOST_ARCH_ALREADY_SPECIFIED "")
+MACRO(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL)
+  KOKKOS_ARCH_OPTION(${ARCH} HOST "${LABEL}" TRUE)
+  IF(KOKKOS_ARCH_${ARCH})
+    IF(HOST_ARCH_ALREADY_SPECIFIED)
+      MESSAGE(FATAL_ERROR "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.")
+    ENDIF()
+    SET(HOST_ARCH_ALREADY_SPECIFIED ${ARCH})
+  ENDIF()
+ENDMACRO()
+
+DECLARE_AND_CHECK_HOST_ARCH(NATIVE            "local machine")
+DECLARE_AND_CHECK_HOST_ARCH(AMDAVX            "AMD chip")
+DECLARE_AND_CHECK_HOST_ARCH(ARMV80            "ARMv8.0 Compatible CPU")
+DECLARE_AND_CHECK_HOST_ARCH(ARMV81            "ARMv8.1 Compatible CPU")
+DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX    "ARMv8 Cavium ThunderX CPU")
+DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2   "ARMv8 Cavium ThunderX2 CPU")
+DECLARE_AND_CHECK_HOST_ARCH(A64FX             "ARMv8.2 with SVE Support")
+DECLARE_AND_CHECK_HOST_ARCH(WSM               "Intel Westmere CPU")
+DECLARE_AND_CHECK_HOST_ARCH(SNB               "Intel Sandy/Ivy Bridge CPUs")
+DECLARE_AND_CHECK_HOST_ARCH(HSW               "Intel Haswell CPUs")
+DECLARE_AND_CHECK_HOST_ARCH(BDW               "Intel Broadwell Xeon E-class CPUs")
+DECLARE_AND_CHECK_HOST_ARCH(ICL               "Intel Ice Lake Client CPUs (AVX512)")
+DECLARE_AND_CHECK_HOST_ARCH(ICX               "Intel Ice Lake Xeon Server CPUs (AVX512)")
+DECLARE_AND_CHECK_HOST_ARCH(SKL               "Intel Skylake Client CPUs")
+DECLARE_AND_CHECK_HOST_ARCH(SKX               "Intel Skylake Xeon Server CPUs (AVX512)")
+DECLARE_AND_CHECK_HOST_ARCH(KNC               "Intel Knights Corner Xeon Phi")
+DECLARE_AND_CHECK_HOST_ARCH(KNL               "Intel Knights Landing Xeon Phi")
+DECLARE_AND_CHECK_HOST_ARCH(SPR               "Intel Sapphire Rapids Xeon Server CPUs (AVX512)")
+DECLARE_AND_CHECK_HOST_ARCH(BGQ               "IBM Blue Gene Q")
+DECLARE_AND_CHECK_HOST_ARCH(POWER7            "IBM POWER7 CPUs")
+DECLARE_AND_CHECK_HOST_ARCH(POWER8            "IBM POWER8 CPUs")
+DECLARE_AND_CHECK_HOST_ARCH(POWER9            "IBM POWER9 CPUs")
+DECLARE_AND_CHECK_HOST_ARCH(ZEN               "AMD Zen architecture")
+DECLARE_AND_CHECK_HOST_ARCH(ZEN2              "AMD Zen2 architecture")
+DECLARE_AND_CHECK_HOST_ARCH(ZEN3              "AMD Zen3 architecture")
+
+IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
+  SET(KOKKOS_SHOW_CUDA_ARCHS ON)
+ENDIF()
+
+KOKKOS_ARCH_OPTION(KEPLER30        GPU  "NVIDIA Kepler generation CC 3.0"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(KEPLER32        GPU  "NVIDIA Kepler generation CC 3.2"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(KEPLER35        GPU  "NVIDIA Kepler generation CC 3.5"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(KEPLER37        GPU  "NVIDIA Kepler generation CC 3.7"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(MAXWELL50       GPU  "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(MAXWELL52       GPU  "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(MAXWELL53       GPU  "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(PASCAL60        GPU  "NVIDIA Pascal generation CC 6.0"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(PASCAL61        GPU  "NVIDIA Pascal generation CC 6.1"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(VOLTA70         GPU  "NVIDIA Volta generation CC 7.0"   "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(VOLTA72         GPU  "NVIDIA Volta generation CC 7.2"   "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(TURING75        GPU  "NVIDIA Turing generation CC 7.5"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(AMPERE80        GPU  "NVIDIA Ampere generation CC 8.0"  "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(AMPERE86        GPU  "NVIDIA Ampere generation CC 8.6"  "KOKKOS_SHOW_CUDA_ARCHS")
+
+IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
+  SET(KOKKOS_SHOW_HIP_ARCHS ON)
+ENDIF()
+
+KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900"      "KOKKOS_SHOW_HIP_ARCHS")
+KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906" "KOKKOS_SHOW_HIP_ARCHS")
+KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU MI100 GFX908"     "KOKKOS_SHOW_HIP_ARCHS")
+KOKKOS_ARCH_OPTION(VEGA90A         GPU  "AMD GPU MI200 GFX90A"     "KOKKOS_SHOW_HIP_ARCHS")
+
+IF(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
+  SET(KOKKOS_SHOW_SYCL_ARCHS ON)
+ENDIF()
+
+KOKKOS_ARCH_OPTION(INTEL_GEN       GPU  "SPIR64-based devices, e.g. Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS")
+KOKKOS_ARCH_OPTION(INTEL_DG1       GPU  "Intel Iris XeMAX GPU"                             "KOKKOS_SHOW_SYCL_ARCHS")
+KOKKOS_ARCH_OPTION(INTEL_GEN9      GPU  "Intel GPU Gen9"                                   "KOKKOS_SHOW_SYCL_ARCHS")
+KOKKOS_ARCH_OPTION(INTEL_GEN11     GPU  "Intel GPU Gen11"                                  "KOKKOS_SHOW_SYCL_ARCHS")
+KOKKOS_ARCH_OPTION(INTEL_GEN12LP   GPU  "Intel GPU Gen12LP"                                "KOKKOS_SHOW_SYCL_ARCHS")
+KOKKOS_ARCH_OPTION(INTEL_XEHP      GPU  "Intel GPU Xe-HP"                                  "KOKKOS_SHOW_SYCL_ARCHS")
+KOKKOS_ARCH_OPTION(INTEL_PVC       GPU  "Intel GPU Ponte Vecchio"                          "KOKKOS_SHOW_SYCL_ARCHS")
 
 IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
   SET(COMMON_WARNINGS
@@ -92,6 +124,13 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
     LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare")
   ENDIF()
 
+  # NVHPC compiler does not support -Wtype-limits.
+  IF(KOKKOS_ENABLE_OPENACC)
+    IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+      LIST(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits")
+    ENDIF()
+  ENDIF()
+
   IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
     LIST(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough")
   ENDIF()
@@ -102,12 +141,15 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
     LIST(APPEND GNU_WARNINGS "-Wimplicit-fallthrough")
   ENDIF()
 
-  COMPILER_SPECIFIC_FLAGS(
-    COMPILER_ID CMAKE_CXX_COMPILER_ID
-    NVHPC       NO-VALUE-SPECIFIED
-    GNU         ${GNU_WARNINGS}
-    DEFAULT     ${COMMON_WARNINGS}
-  )
+  # Not using COMPILER_SPECIFIC_FLAGS function so the warning flags are not passed downstream
+  IF(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+    STRING(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}")
+  ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC)
+    # FIXME_NVHPC
+  ELSE()
+    STRING(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}")
+  ENDIF()
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}")
 ENDIF()
 
 
@@ -142,6 +184,10 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
   IF (KOKKOS_ENABLE_CUDA)
      SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE)
   ENDIF()
+ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+  SET(CUDA_ARCH_FLAG "-gpu")
+  GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
+  GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
   SET(CUDA_ARCH_FLAG "-arch")
 ENDIF()
@@ -162,10 +208,8 @@ ENDIF()
 #clear anything that might be in the cache
 GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
 IF(KOKKOS_ENABLE_HIP)
-  IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
-    SET(AMDGPU_ARCH_FLAG "--amdgpu-target")
-  ELSE()
-    SET(AMDGPU_ARCH_FLAG "--offload-arch")
+  SET(AMDGPU_ARCH_FLAG "--offload-arch")
+  IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
     GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip)
     IF(DEFINED ENV{ROCM_PATH})
       GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH})
@@ -174,6 +218,13 @@ IF(KOKKOS_ENABLE_HIP)
 ENDIF()
 
 
+IF(KOKKOS_ARCH_NATIVE)
+  COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    DEFAULT -march=native -mtune=native
+  )
+ENDIF()
+
 IF (KOKKOS_ARCH_ARMV80)
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
@@ -312,13 +363,22 @@ IF (KOKKOS_ARCH_KNL)
 ENDIF()
 
 IF (KOKKOS_ARCH_KNC)
-  SET(KOKKOS_USE_ISA_KNC ON)
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     DEFAULT -mmic
   )
 ENDIF()
 
+IF (KOKKOS_ARCH_SKL)
+  COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Intel   -xSKYLAKE
+    NVHPC   -tp=skylake
+    Cray    NO-VALUE-SPECIFIED
+    DEFAULT -march=skylake -mtune=skylake
+  )
+ENDIF()
+
 IF (KOKKOS_ARCH_SKX)
   #avx512-xeon
   SET(KOKKOS_ARCH_AVX512XEON ON)
@@ -327,16 +387,32 @@ IF (KOKKOS_ARCH_SKX)
     Intel   -xCORE-AVX512
     NVHPC   -tp=skylake
     Cray    NO-VALUE-SPECIFIED
-    DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
+    DEFAULT -march=skylake-avx512 -mtune=skylake-avx512
   )
 ENDIF()
 
-IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2 OR KOKKOS_ARCH_ZEN3)
-  SET(KOKKOS_USE_ISA_X86_64 ON)
+IF (KOKKOS_ARCH_ICL)
+  SET(KOKKOS_ARCH_AVX512XEON ON)
+  COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    DEFAULT -march=icelake-client -mtune=icelake-client
+  )
 ENDIF()
 
-IF (KOKKOS_ARCH_BDW OR KOKKOS_ARCH_SKX)
-  SET(KOKKOS_ENABLE_TM ON) #not a cache variable
+IF (KOKKOS_ARCH_ICX)
+  SET(KOKKOS_ARCH_AVX512XEON ON)
+  COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    DEFAULT -march=icelake-server -mtune=icelake-server
+  )
+ENDIF()
+
+IF (KOKKOS_ARCH_SPR)
+  SET(KOKKOS_ARCH_AVX512XEON ON)
+  COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    DEFAULT -march=sapphirerapids -mtune=sapphirerapids
+  )
 ENDIF()
 
 IF (KOKKOS_ARCH_POWER7)
@@ -345,7 +421,6 @@ IF (KOKKOS_ARCH_POWER7)
     NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -mcpu=power7 -mtune=power7
   )
-  SET(KOKKOS_USE_ISA_POWERPCBE ON)
 ENDIF()
 
 IF (KOKKOS_ARCH_POWER8)
@@ -364,14 +439,15 @@ IF (KOKKOS_ARCH_POWER9)
   )
 ENDIF()
 
-IF (KOKKOS_ARCH_POWER8 OR KOKKOS_ARCH_POWER9)
-  SET(KOKKOS_USE_ISA_POWERPCLE ON)
-ENDIF()
-
 IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
   COMPILER_SPECIFIC_FLAGS(
     Clang  -fcuda-rdc
     NVIDIA --relocatable-device-code=true
+    NVHPC -gpu=rdc
+  )
+ELSEIF(KOKKOS_ENABLE_CUDA)
+  COMPILER_SPECIFIC_FLAGS(
+    NVHPC -gpu=nordc
   )
 ENDIF()
 
@@ -421,8 +497,8 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
       MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.")
     ENDIF()
     SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE)
-    IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL)
-      MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
+    IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC)
+      MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.")
       UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE)
     ELSE()
       SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE)
@@ -430,9 +506,15 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
         string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG})
         SET(CMAKE_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE)
       ELSE()
-        GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
-        IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
-          GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
+        IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+          STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${FLAG})
+          GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${NVHPC_CUDA_ARCH}")
+          GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${NVHPC_CUDA_ARCH}")
+        ELSE()
+          GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
+          IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
+            GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
+          ENDIF()
         ENDIF()
       ENDIF()
     ENDIF()
@@ -529,13 +611,16 @@ ENDIF()
 IF(KOKKOS_ARCH_INTEL_XEHP)
   CHECK_MULTIPLE_INTEL_ARCH()
 ENDIF()
+IF(KOKKOS_ARCH_INTEL_PVC)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
 
 IF (KOKKOS_ENABLE_OPENMPTARGET)
   SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
   IF (CLANG_CUDA_ARCH)
     STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH})
     COMPILER_SPECIFIC_FLAGS(
-      Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda
+      Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64
       XL    -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
       NVHPC -gpu=${NVHPC_CUDA_ARCH}
     )
@@ -546,10 +631,47 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
       Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa
     )
   ENDIF()
-  IF (KOKKOS_ARCH_INTEL_GPU)
+  IF (KOKKOS_ARCH_INTEL_GEN)
     COMPILER_SPECIFIC_FLAGS(
       IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
     )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
+    COMPILER_SPECIFIC_FLAGS(
+      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" -D__STRICT_ANSI__
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
+    COMPILER_SPECIFIC_FLAGS(
+      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" -D__STRICT_ANSI__
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
+    COMPILER_SPECIFIC_FLAGS(
+      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" -D__STRICT_ANSI__
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
+    COMPILER_SPECIFIC_FLAGS(
+      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" -D__STRICT_ANSI__
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
+    COMPILER_SPECIFIC_FLAGS(
+      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device xehp" -D__STRICT_ANSI__
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_PVC)
+    COMPILER_SPECIFIC_FLAGS(
+      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.4.0" -D__STRICT_ANSI__
+    )
+  ENDIF()
+ENDIF()
+
+IF (KOKKOS_ENABLE_OPENACC)
+  IF(KOKKOS_CUDA_ARCH_FLAG)
+    STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
+    COMPILER_SPECIFIC_FLAGS(
+      NVHPC -acc -gpu=${NVHPC_CUDA_ARCH}
+    )
+  ELSE()
+    COMPILER_SPECIFIC_FLAGS(
+      NVHPC -acc
+    )
   ENDIF()
 ENDIF()
 
@@ -564,7 +686,7 @@ IF (KOKKOS_ENABLE_SYCL)
     ENDIF()
   ELSEIF(KOKKOS_ARCH_INTEL_GEN)
     COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9-"
+      DEFAULT -fsycl-targets=spir64
     )
   ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
     COMPILER_SPECIFIC_FLAGS(
@@ -586,6 +708,10 @@ IF (KOKKOS_ENABLE_SYCL)
     COMPILER_SPECIFIC_FLAGS(
       DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device xehp"
     )
+  ELSEIF(KOKKOS_ARCH_INTEL_PVC)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.4.0"
+    )
   ENDIF()
 ENDIF()
 
@@ -687,7 +813,7 @@ ENDIF()
 #Let's just always print things
 MESSAGE(STATUS "Built-in Execution Spaces:")
 
-FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL)
+FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC)
   STRING(TOUPPER ${_BACKEND} UC_BACKEND)
   IF(KOKKOS_ENABLE_${UC_BACKEND})
     IF(_DEVICE_PARALLEL)
diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake
index f0c906e65..b9fe2ffab 100644
--- a/packages/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake
@@ -37,12 +37,16 @@ IF(Kokkos_ENABLE_CUDA)
         PATHS           ${PROJECT_SOURCE_DIR}
         PATH_SUFFIXES   bin)
 
-    # check if compiler was set to nvcc_wrapper
+    # Check if compiler was set to nvcc_wrapper
     kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER})
-    # if launcher was found and nvcc_wrapper was not specified as
-    # compiler, set to use launcher. Will ensure CMAKE_CXX_COMPILER
-    # is replaced by nvcc_wrapper
-    IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
+    # If launcher was found and nvcc_wrapper was not specified as
+    # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher.
+    # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper
+    IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+      IF(CMAKE_CXX_COMPILER_LAUNCHER)
+       MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or
+clang++!")
+      ENDIF()
       # the first argument to launcher is always the C++ compiler defined by cmake
       # if the second argument matches the C++ compiler, it forwards the rest of the
       # args to nvcc_wrapper
diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake
index dc3ee8c84..c7a454c21 100644
--- a/packages/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake
@@ -86,6 +86,8 @@ IF(KOKKOS_ENABLE_OPENMP)
   ENDIF()
 ENDIF()
 
+KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend")
+
 KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend")
 IF (KOKKOS_ENABLE_OPENMPTARGET)
   SET(ClangOpenMPFlag -fopenmp=libomp)
diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake
index a581003b9..ea606bb0c 100644
--- a/packages/kokkos/cmake/kokkos_enable_options.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_options.cmake
@@ -61,6 +61,7 @@ KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple ke
 
 # This option will go away eventually, but allows fallback to old implementation when needed.
 KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS   ON  "Whether to use desul based atomics - option only during beta")
+KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation")
 
 IF (KOKKOS_ENABLE_CUDA)
   SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")
diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake
index 02c9a911b..11fa9b302 100644
--- a/packages/kokkos/cmake/kokkos_functions.cmake
+++ b/packages/kokkos/cmake/kokkos_functions.cmake
@@ -57,7 +57,46 @@ FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING)
   # Make sure this appears in the cache with the appropriate DOCSTRING
   SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING})
 
-  #I don't love doing it this way because it's N^2 in number options, but cest la vie
+  #I don't love doing it this way because it's N^2 in number options, but c'est la vie
+  FOREACH(opt ${KOKKOS_GIVEN_VARIABLES})
+    STRING(TOUPPER ${opt} OPT_UC)
+    IF ("${OPT_UC}" STREQUAL "${UC_NAME}")
+      IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}")
+        IF (KOKKOS_HAS_TRILINOS)
+          #Allow this for now if Trilinos... we need to bootstrap our way to integration
+          MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}")
+          SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE)
+          UNSET(${opt} CACHE)
+        ELSE()
+          MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.")
+        ENDIF()
+      ENDIF()
+    ENDIF()
+  ENDFOREACH()
+
+  #okay, great, we passed the validation test - use the default
+  IF (DEFINED ${CAMEL_NAME})
+    SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE)
+  ELSE()
+    SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE)
+  ENDIF()
+ENDFUNCTION()
+
+INCLUDE (CMakeDependentOption)
+FUNCTION(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE)
+  SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX})
+  STRING(TOUPPER ${CAMEL_NAME} UC_NAME)
+
+  LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX})
+  SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE)
+  LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}")
+  SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE)
+  LIST(APPEND KOKKOS_OPTION_TYPES BOOL)
+  SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE)
+
+  CMAKE_DEPENDENT_OPTION(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE})
+
+  #I don't love doing it this way because it's N^2 in number options, but c'est la vie
   FOREACH(opt ${KOKKOS_GIVEN_VARIABLES})
     STRING(TOUPPER ${opt} OPT_UC)
     IF ("${OPT_UC}" STREQUAL "${UC_NAME}")
@@ -102,6 +141,8 @@ FUNCTION(kokkos_append_config_line LINE)
 ENDFUNCTION()
 
 MACRO(kokkos_export_cmake_tpl NAME)
+  cmake_parse_arguments(KOKKOS_EXTRA_ARG "REQUIRED" "" "COMPONENTS" ${ARGN})
+
   #CMake TPLs are located with a call to find_package
   #find_package locates XConfig.cmake files through
   #X_DIR or X_ROOT variables set prior to calling find_package
@@ -125,7 +166,16 @@ MACRO(kokkos_export_cmake_tpl NAME)
     KOKKOS_APPEND_CONFIG_LINE("  SET(${NAME}_ROOT  ${${NAME}_ROOT})")
     KOKKOS_APPEND_CONFIG_LINE("ENDIF()")
   ENDIF()
-  KOKKOS_APPEND_CONFIG_LINE("FIND_DEPENDENCY(${NAME})")
+  SET(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}")
+
+  IF(KOKKOS_EXTRA_ARG_REQUIRED)
+    STRING(APPEND KOKKOS_CONFIG_STRING " REQUIRED")
+  ENDIF()
+  IF(KOKKOS_EXTRA_ARG_COMPONENTS)
+    STRING(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}")
+  ENDIF()
+  STRING(APPEND KOKKOS_CONFIG_STRING ")")
+  KOKKOS_APPEND_CONFIG_LINE(${KOKKOS_CONFIG_STRING})
 ENDMACRO()
 
 MACRO(kokkos_export_imported_tpl NAME)
@@ -224,12 +274,6 @@ MACRO(kokkos_import_tpl NAME)
     SET(TPL_IMPORTED_NAME Kokkos::${NAME})
   ENDIF()
 
-  # Even though this policy gets set in the top-level CMakeLists.txt,
-  # I have still been getting errors about ROOT variables being ignored
-  # I'm not sure if this is a scope issue - but make sure
-  # the policy is set before we do any find_package calls
-  CMAKE_POLICY(SET CMP0074 NEW)
-
   IF (KOKKOS_ENABLE_${NAME})
     #Tack on a TPL here to make sure we avoid using anyone else's find
     FIND_PACKAGE(TPL${NAME} REQUIRED MODULE)
@@ -587,11 +631,16 @@ ENDMACRO()
 #
 #   ``LIBRARY <name>``
 #
-#     If specified, this gives the name of the library to look for
+#     If specified, this gives the name of the library to look for.
+#     The full path for the library found will be used as IMPORTED_LOCATION
+#     for the target created. Thus, this cannot be used for interface libraries.
 #
 #   ``LIBRARIES <name1> <name2> ...``
 #
-#     If specified, this gives a list of libraries to find for the package
+#     If specified, this gives a list of libraries to find for the package.
+#     As opposed to the LIBRARY argument, this can be used with interface
+#     libraries. In that case, we directly use the names provided here
+#     for linking when creating the new target.
 #
 #   ``LIBRARY_PATHS <path1> <path2> ...``
 #
@@ -707,6 +756,7 @@ MACRO(kokkos_find_imported NAME)
     SET(IMPORT_TYPE)
     IF (TPL_INTERFACE)
       SET(IMPORT_TYPE "INTERFACE")
+      SET(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES})
     ENDIF()
     KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME}
       ${IMPORT_TYPE}
@@ -790,15 +840,15 @@ FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
     SET(COMPILER ${KOKKOS_CXX_COMPILER_ID})
   ENDIF()
 
-  SET(COMPILER_SPECIFIC_FLAGS_TMP)
+  SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT})
   FOREACH(COMP ${COMPILERS})
     IF (COMPILER STREQUAL "${COMP}")
       IF (PARSE_${COMPILER})
-        IF (NOT "${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED")
+        IF ("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED")
+           SET(COMPILER_SPECIFIC_FLAGS_TMP "")
+        ELSE()
            SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}})
         ENDIF()
-      ELSEIF(PARSE_DEFAULT)
-        SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT})
       ENDIF()
     ENDIF()
   ENDFOREACH()
diff --git a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake
index 015873ebd..03f1a0d18 100644
--- a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake
+++ b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake
@@ -1,13 +1,15 @@
 # From CMake 3.10 documentation
 
 #This can run at any time
-KOKKOS_OPTION(CXX_STANDARD "" STRING "The C++ standard for Kokkos to use: 14, 17, or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 14")
+KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 14, 17, or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 14")
 
 # Set CXX standard flags
 SET(KOKKOS_ENABLE_CXX14 OFF)
 SET(KOKKOS_ENABLE_CXX17 OFF)
 SET(KOKKOS_ENABLE_CXX20 OFF)
 IF (KOKKOS_CXX_STANDARD)
+  MESSAGE(DEPRECATION "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead")
+
   IF (${KOKKOS_CXX_STANDARD} STREQUAL "c++98")
     MESSAGE(FATAL_ERROR "Kokkos no longer supports C++98 - minimum C++14")
   ELSEIF (${KOKKOS_CXX_STANDARD} STREQUAL "c++11")
diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
index 1eb0592c7..a5a8f40af 100644
--- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
+++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
@@ -128,8 +128,8 @@ IF(KOKKOS_ENABLE_CUDA)
     ELSEIF(CMAKE_CXX_EXTENSIONS)
       MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions.  Set -DCMAKE_CXX_EXTENSIONS=OFF")
     ENDIF()
-  ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
-    MESSAGE(FATAL_ERROR "Invalid compiler for CUDA.  The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}")
+  ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+    MESSAGE(FATAL_ERROR "Invalid compiler for CUDA.  The compiler must be nvcc_wrapper or Clang or NVC++ or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}")
   ENDIF()
 ENDIF()
 
diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake
index 54c6b520b..2c712a519 100644
--- a/packages/kokkos/cmake/kokkos_tpls.cmake
+++ b/packages/kokkos/cmake/kokkos_tpls.cmake
@@ -85,6 +85,11 @@ ENDIF()
 KOKKOS_IMPORT_TPL(ROCM INTERFACE)
 KOKKOS_IMPORT_TPL(LIBQUADMATH)
 
+IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL)
+  find_package(desul REQUIRED COMPONENTS atomics)
+  KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics)
+ENDIF()
+
 #Convert list to newlines (which CMake doesn't always like in cache variables)
 STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}")
 #Convert to a regular variable
diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake
index 1ec45d19b..34e45ecf7 100644
--- a/packages/kokkos/cmake/kokkos_tribits.cmake
+++ b/packages/kokkos/cmake/kokkos_tribits.cmake
@@ -88,6 +88,9 @@ MACRO(KOKKOS_PROCESS_SUBPACKAGES)
     ADD_SUBDIRECTORY(core)
     ADD_SUBDIRECTORY(containers)
     ADD_SUBDIRECTORY(algorithms)
+    if (KOKKOS_CXX_STANDARD GREATER_EQUAL 17)
+      ADD_SUBDIRECTORY(simd)
+    endif()
     ADD_SUBDIRECTORY(example)
   endif()
 ENDMACRO()
@@ -342,7 +345,6 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES)
   INSTALL(PROGRAMS
           "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper"
           "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind"
-          "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler"
           "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler"
           DESTINATION ${CMAKE_INSTALL_BINDIR})
   INSTALL(FILES
diff --git a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
index 16b74a499..77451bb9e 100644
--- a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
+++ b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -140,9 +140,7 @@ struct find_test {
   void init(value_type& v) const { v = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, volatile value_type const& src) const {
-    dst += src;
-  }
+  void join(value_type& dst, value_type const& src) const { dst += src; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(size_type i, value_type& num_errors) const {
diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
index 8a23f59d3..bd06be966 100644
--- a/packages/kokkos/containers/performance_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
@@ -82,8 +82,8 @@ void test_scatter_view(int m, int n) {
         Kokkos::Timer timer;
         timer.reset();
         for (int k = 0; k < m; ++k) {
-          Kokkos::parallel_for(policy, f2,
-                               "hand_coded_duplicate_scatter_view_test");
+          Kokkos::parallel_for("hand_coded_duplicate_scatter_view_test", policy,
+                               f2);
         }
         Kokkos::fence();
         auto t = timer.seconds();
@@ -102,7 +102,7 @@ void test_scatter_view(int m, int n) {
         Kokkos::Timer timer;
         timer.reset();
         for (int k = 0; k < m; ++k) {
-          Kokkos::parallel_for(policy, f, "scatter_view_test");
+          Kokkos::parallel_for("scatter_view_test", policy, f);
         }
         Kokkos::fence();
         auto t = timer.seconds();
diff --git a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
index 4547d5c35..8ff208d6a 100644
--- a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
+++ b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -147,7 +147,7 @@ struct UnorderedMapTest {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
+  void join(value_type& dst, const value_type& src) const {
     dst.failed_count += src.failed_count;
     dst.max_list = src.max_list < dst.max_list ? dst.max_list : src.max_list;
   }
diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
index ea73c4f53..05121b4e5 100644
--- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -44,14 +44,16 @@
 
 #ifndef KOKKOS_BITSET_HPP
 #define KOKKOS_BITSET_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_BITSET
+#endif
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Functional.hpp>
 
 #include <impl/Kokkos_Bitset_impl.hpp>
 
-#include <stdexcept>
-
 namespace Kokkos {
 
 template <typename Device = Kokkos::DefaultExecutionSpace>
@@ -403,7 +405,7 @@ class ConstBitset {
 template <typename DstDevice, typename SrcDevice>
 void deep_copy(Bitset<DstDevice>& dst, Bitset<SrcDevice> const& src) {
   if (dst.size() != src.size()) {
-    throw std::runtime_error(
+    Kokkos::Impl::throw_runtime_exception(
         "Error: Cannot deep_copy bitsets of different sizes!");
   }
 
@@ -418,7 +420,7 @@ void deep_copy(Bitset<DstDevice>& dst, Bitset<SrcDevice> const& src) {
 template <typename DstDevice, typename SrcDevice>
 void deep_copy(Bitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
   if (dst.size() != src.size()) {
-    throw std::runtime_error(
+    Kokkos::Impl::throw_runtime_exception(
         "Error: Cannot deep_copy bitsets of different sizes!");
   }
 
@@ -433,7 +435,7 @@ void deep_copy(Bitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
 template <typename DstDevice, typename SrcDevice>
 void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
   if (dst.size() != src.size()) {
-    throw std::runtime_error(
+    Kokkos::Impl::throw_runtime_exception(
         "Error: Cannot deep_copy bitsets of different sizes!");
   }
 
@@ -447,4 +449,8 @@ void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_BITSET
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_BITSET
+#endif
 #endif  // KOKKOS_BITSET_HPP
diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp
index 8c80ec55b..916c54d60 100644
--- a/packages/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp
@@ -50,6 +50,10 @@
 
 #ifndef KOKKOS_DUALVIEW_HPP
 #define KOKKOS_DUALVIEW_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DUALVIEW
+#endif
 
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_Error.hpp>
@@ -144,7 +148,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   using t_dev_const_randomread =
       View<typename traits::const_data_type, typename traits::array_layout,
            typename traits::device_type,
-           Kokkos::MemoryTraits<Kokkos::RandomAccess> >;
+           Kokkos::MemoryTraits<Kokkos::RandomAccess>>;
 
   /// \typedef t_host_const_randomread
   /// \brief The type of a const, random-access View host mirror of
@@ -175,7 +179,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   using t_dev_const_randomread_um =
       View<typename t_host::const_data_type, typename t_host::array_layout,
            typename t_host::device_type,
-           Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess> >;
+           Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>>;
 
   /// \typedef t_host_const_randomread
   /// \brief The type of a const, random-access View host mirror of
@@ -232,7 +236,9 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
            const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
            const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
            const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
-      : modified_flags(t_modified_flags("DualView::modified_flags")),
+      : modified_flags(
+            Kokkos::view_alloc(typename t_modified_flags::execution_space{},
+                               "DualView::modified_flags")),
         d_view(label, n0, n1, n2, n3, n4, n5, n6, n7),
         h_view(create_mirror_view(d_view))  // without UVM, host View mirrors
   {}
@@ -249,16 +255,15 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   /// omit the integer arguments that follow.
   template <class... P>
   DualView(const Impl::ViewCtorProp<P...>& arg_prop,
-           typename std::enable_if<!Impl::ViewCtorProp<P...>::has_pointer,
-                                   size_t>::type const n0 =
-               KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-           const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-           const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-           const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-           const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-           const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-           const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
-           const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
+           std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer,
+                            size_t> const n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+           const size_t n1                   = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+           const size_t n2                   = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+           const size_t n3                   = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+           const size_t n4                   = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+           const size_t n5                   = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+           const size_t n6                   = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+           const size_t n7                   = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
       : modified_flags(t_modified_flags("DualView::modified_flags")),
         d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) {
     // without UVM, host View mirrors
@@ -403,7 +408,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                   impl_device_matches_tdev_exec<Device>::value, t_dev,
                   typename std::conditional_t<
                       impl_device_matches_tdev_memory_space<Device>::value,
-                      t_dev, t_host> > > > >
+                      t_dev, t_host>>>>>
   view() const {
     constexpr bool device_is_memspace =
         std::is_same<Device, typename Device::memory_space>::value;
@@ -609,21 +614,21 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   }
 
   template <class Device>
-  void sync(const typename std::enable_if<
+  void sync(const std::enable_if_t<
                 (std::is_same<typename traits::data_type,
                               typename traits::non_const_data_type>::value) ||
                     (std::is_same<Device, int>::value),
-                int>::type& = 0) {
+                int>& = 0) {
     sync_impl<Device>(std::true_type{});
   }
 
   template <class Device, class ExecutionSpace>
   void sync(const ExecutionSpace& exec,
-            const typename std::enable_if<
+            const std::enable_if_t<
                 (std::is_same<typename traits::data_type,
                               typename traits::non_const_data_type>::value) ||
                     (std::is_same<Device, int>::value),
-                int>::type& = 0) {
+                int>& = 0) {
     sync_impl<Device>(std::true_type{}, exec);
   }
 
@@ -651,20 +656,20 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   }
 
   template <class Device>
-  void sync(const typename std::enable_if<
+  void sync(const std::enable_if_t<
                 (!std::is_same<typename traits::data_type,
                                typename traits::non_const_data_type>::value) ||
                     (std::is_same<Device, int>::value),
-                int>::type& = 0) {
+                int>& = 0) {
     sync_impl<Device>(std::false_type{});
   }
   template <class Device, class ExecutionSpace>
   void sync(const ExecutionSpace& exec,
-            const typename std::enable_if<
+            const std::enable_if_t<
                 (!std::is_same<typename traits::data_type,
                                typename traits::non_const_data_type>::value) ||
                     (std::is_same<Device, int>::value),
-                int>::type& = 0) {
+                int>& = 0) {
     sync_impl<Device>(std::false_type{}, exec);
   }
 
@@ -786,7 +791,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
             std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
                 nullptr>
   void modify() {
-    if (modified_flags.data() == nullptr) return;
+    if (modified_flags.data() == nullptr) {
+      modified_flags = t_modified_flags("DualView::modified_flags");
+    }
+
     int dev = get_device_side<Device>();
 
     if (dev == 1) {  // if Device is the same as DualView's device type
@@ -899,21 +907,55 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   /// This discards any existing contents of the objects, and resets
   /// their modified flags.  It does <i>not</i> copy the old contents
   /// of either View into the new View objects.
-  template <class... I>
+  template <class... ViewCtorArgs>
   void impl_realloc(const size_t n0, const size_t n1, const size_t n2,
                     const size_t n3, const size_t n4, const size_t n5,
-                    const size_t n6, const size_t n7, const I&... arg_prop) {
+                    const size_t n6, const size_t n7,
+                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+    using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+    static_assert(!alloc_prop_input::has_label,
+                  "The view constructor arguments passed to Kokkos::realloc "
+                  "must not include a label!");
+    static_assert(
+        !alloc_prop_input::has_pointer,
+        "The view constructor arguments passed to Kokkos::realloc must "
+        "not include a pointer!");
+    static_assert(
+        !alloc_prop_input::has_memory_space,
+        "The view constructor arguments passed to Kokkos::realloc must "
+        "not include a memory space instance!");
+
     const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
     const bool sizeMismatch =
         Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
 
     if (sizeMismatch) {
-      ::Kokkos::realloc(arg_prop..., d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-      h_view = create_mirror_view(arg_prop..., typename t_host::memory_space(),
-                                  d_view);
-    } else if (!Kokkos::Impl::has_type<Kokkos::Impl::WithoutInitializing_t,
-                                       I...>::value) {
-      ::Kokkos::deep_copy(d_view, typename t_dev::value_type{});
+      ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+      if (alloc_prop_input::initialize) {
+        h_view = create_mirror_view(typename t_host::memory_space(), d_view);
+      } else {
+        h_view = create_mirror_view(Kokkos::WithoutInitializing,
+                                    typename t_host::memory_space(), d_view);
+      }
+    } else if (alloc_prop_input::initialize) {
+      if (alloc_prop_input::has_execution_space) {
+        // Add execution_space if not provided to avoid need for if constexpr
+        using alloc_prop = Impl::ViewCtorProp<
+            ViewCtorArgs...,
+            std::conditional_t<alloc_prop_input::has_execution_space,
+                               std::integral_constant<unsigned int, 2>,
+                               typename t_dev::execution_space>>;
+        alloc_prop arg_prop_copy(arg_prop);
+        using execution_space_type = typename alloc_prop::execution_space;
+        const execution_space_type& exec_space =
+            static_cast<
+                Kokkos::Impl::ViewCtorProp<void, execution_space_type> const&>(
+                arg_prop_copy)
+                .value;
+        ::Kokkos::deep_copy(exec_space, d_view, typename t_dev::value_type{});
+      } else
+        ::Kokkos::deep_copy(d_view, typename t_dev::value_type{});
     }
 
     /* Reset dirty flags */
@@ -923,6 +965,19 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
       modified_flags(1) = modified_flags(0) = 0;
   }
 
+  template <class... ViewCtorArgs>
+  void realloc(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+               const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    impl_realloc(n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
+  }
+
   void realloc(const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -931,7 +986,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-    impl_realloc(n0, n1, n2, n3, n4, n5, n6, n7);
+    impl_realloc(n0, n1, n2, n3, n4, n5, n6, n7, Impl::ViewCtorProp<>{});
   }
 
   template <typename I>
@@ -944,17 +999,32 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
       const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-    impl_realloc(n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
+    impl_realloc(n0, n1, n2, n3, n4, n5, n6, n7, Kokkos::view_alloc(arg_prop));
   }
 
   /// \brief Resize both views, copying old contents into new if necessary.
   ///
   /// This method only copies the old contents into the new View
   /// objects for the device which was last marked as modified.
-  template <class... I>
-  void impl_resize(const size_t n0, const size_t n1, const size_t n2,
+  template <class... ViewCtorArgs>
+  void impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+                   const size_t n0, const size_t n1, const size_t n2,
                    const size_t n3, const size_t n4, const size_t n5,
-                   const size_t n6, const size_t n7, const I&... arg_prop) {
+                   const size_t n6, const size_t n7) {
+    using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+    static_assert(!alloc_prop_input::has_label,
+                  "The view constructor arguments passed to Kokkos::resize "
+                  "must not include a label!");
+    static_assert(
+        !alloc_prop_input::has_pointer,
+        "The view constructor arguments passed to Kokkos::resize must "
+        "not include a pointer!");
+    static_assert(
+        !alloc_prop_input::has_memory_space,
+        "The view constructor arguments passed to Kokkos::resize must "
+        "not include a memory space instance!");
+
     const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
     const bool sizeMismatch =
         Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
@@ -965,22 +1035,31 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     if (modified_flags(1) >= modified_flags(0)) {
       /* Resize on Device */
       if (sizeMismatch) {
-        ::Kokkos::resize(arg_prop..., d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-        h_view = create_mirror_view(arg_prop...,
-                                    typename t_host::memory_space(), d_view);
+        ::Kokkos::resize(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        if (alloc_prop_input::initialize) {
+          h_view = create_mirror_view(typename t_host::memory_space(), d_view);
+        } else {
+          h_view = create_mirror_view(Kokkos::WithoutInitializing,
+                                      typename t_host::memory_space(), d_view);
+        }
 
         /* Mark Device copy as modified */
-        modified_flags(1) = modified_flags(1) + 1;
+        ++modified_flags(1);
       }
     } else {
-      /* Realloc on Device */
+      /* Resize on Host */
       if (sizeMismatch) {
-        ::Kokkos::resize(arg_prop..., h_view, n0, n1, n2, n3, n4, n5, n6, n7);
-        d_view = create_mirror_view(arg_prop..., typename t_dev::memory_space(),
-                                    h_view);
+        ::Kokkos::resize(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        if (alloc_prop_input::initialize) {
+          d_view = create_mirror_view(typename t_dev::memory_space(), h_view);
+
+        } else {
+          d_view = create_mirror_view(Kokkos::WithoutInitializing,
+                                      typename t_dev::memory_space(), h_view);
+        }
 
         /* Mark Host copy as modified */
-        modified_flags(0) = modified_flags(0) + 1;
+        ++modified_flags(0);
       }
     }
   }
@@ -993,7 +1072,20 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-    impl_resize(n0, n1, n2, n3, n4, n5, n6, n7);
+    impl_resize(Impl::ViewCtorProp<>{}, n0, n1, n2, n3, n4, n5, n6, n7);
+  }
+
+  template <class... ViewCtorArgs>
+  void resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+              const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    impl_resize(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7);
   }
 
   template <class I>
@@ -1006,7 +1098,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
       const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-    impl_resize(n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
+    impl_resize(Kokkos::view_alloc(arg_prop), n0, n1, n2, n3, n4, n5, n6, n7);
   }
 
   //@}
@@ -1027,16 +1119,16 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
-      extent(const iType& r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, size_t>
+  extent(const iType& r) const {
     return d_view.extent(r);
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, int>::type
-      extent_int(const iType& r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, int>
+  extent_int(const iType& r) const {
     return static_cast<int>(d_view.extent(r));
   }
 
@@ -1130,6 +1222,15 @@ void resize(DualView<Properties...>& dv, Args&&... args) noexcept(
   dv.resize(std::forward<Args>(args)...);
 }
 
+template <class... ViewCtorArgs, class... Properties, class... Args>
+void resize(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    DualView<Properties...>& dv,
+    Args&&... args) noexcept(noexcept(dv.resize(arg_prop,
+                                                std::forward<Args>(args)...))) {
+  dv.resize(arg_prop, std::forward<Args>(args)...);
+}
+
 template <class I, class... Properties, class... Args>
 std::enable_if_t<Impl::is_view_ctor_property<I>::value> resize(
     const I& arg_prop, DualView<Properties...>& dv,
@@ -1138,6 +1239,15 @@ std::enable_if_t<Impl::is_view_ctor_property<I>::value> resize(
   dv.resize(arg_prop, std::forward<Args>(args)...);
 }
 
+template <class... ViewCtorArgs, class... Properties, class... Args>
+void realloc(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+             DualView<Properties...>& dv,
+             Args&&... args) noexcept(noexcept(dv
+                                                   .realloc(std::forward<Args>(
+                                                       args)...))) {
+  dv.realloc(arg_prop, std::forward<Args>(args)...);
+}
+
 template <class... Properties, class... Args>
 void realloc(DualView<Properties...>& dv, Args&&... args) noexcept(
     noexcept(dv.realloc(std::forward<Args>(args)...))) {
@@ -1155,4 +1265,8 @@ std::enable_if_t<Impl::is_view_ctor_property<I>::value> realloc(
 
 }  // end namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DUALVIEW
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DUALVIEW
+#endif
 #endif
diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
index 176129d25..442f0d861 100644
--- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -50,6 +50,10 @@
 
 #ifndef KOKKOS_DYNRANKVIEW_HPP
 #define KOKKOS_DYNRANKVIEW_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DYNRANKVIEW
+#endif
 
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_Error.hpp>
@@ -117,10 +121,10 @@ struct DynRankDimTraits {
   // Create the layout for the rank-7 view.
   // Non-strided Layout
   template <typename Layout>
-  KOKKOS_INLINE_FUNCTION static typename std::enable_if<
+  KOKKOS_INLINE_FUNCTION static std::enable_if_t<
       (std::is_same<Layout, Kokkos::LayoutRight>::value ||
        std::is_same<Layout, Kokkos::LayoutLeft>::value),
-      Layout>::type
+      Layout>
   createLayout(const Layout& layout) {
     return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1,
                   layout.dimension[1] != unspecified ? layout.dimension[1] : 1,
@@ -134,8 +138,8 @@ struct DynRankDimTraits {
 
   // LayoutStride
   template <typename Layout>
-  KOKKOS_INLINE_FUNCTION static typename std::enable_if<
-      (std::is_same<Layout, Kokkos::LayoutStride>::value), Layout>::type
+  KOKKOS_INLINE_FUNCTION static std::enable_if_t<
+      (std::is_same<Layout, Kokkos::LayoutStride>::value), Layout>
   createLayout(const Layout& layout) {
     return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1,
                   layout.stride[0],
@@ -157,13 +161,13 @@ struct DynRankDimTraits {
 
   // Extra overload to match that for specialize types
   template <typename Traits, typename... P>
-  KOKKOS_INLINE_FUNCTION static typename std::enable_if<
+  KOKKOS_INLINE_FUNCTION static std::enable_if_t<
       (std::is_same<typename Traits::array_layout,
                     Kokkos::LayoutRight>::value ||
        std::is_same<typename Traits::array_layout, Kokkos::LayoutLeft>::value ||
        std::is_same<typename Traits::array_layout,
                     Kokkos::LayoutStride>::value),
-      typename Traits::array_layout>::type
+      typename Traits::array_layout>
   createLayout(const Kokkos::Impl::ViewCtorProp<P...>& /* prop */,
                const typename Traits::array_layout& layout) {
     return createLayout(layout);
@@ -187,12 +191,12 @@ struct DynRankDimTraits {
 
 // Non-strided Layout
 template <typename Layout, typename iType>
-KOKKOS_INLINE_FUNCTION static
-    typename std::enable_if<(std::is_same<Layout, Kokkos::LayoutRight>::value ||
-                             std::is_same<Layout, Kokkos::LayoutLeft>::value) &&
-                                std::is_integral<iType>::value,
-                            Layout>::type
-    reconstructLayout(const Layout& layout, iType dynrank) {
+KOKKOS_INLINE_FUNCTION static std::enable_if_t<
+    (std::is_same<Layout, Kokkos::LayoutRight>::value ||
+     std::is_same<Layout, Kokkos::LayoutLeft>::value) &&
+        std::is_integral<iType>::value,
+    Layout>
+reconstructLayout(const Layout& layout, iType dynrank) {
   return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX,
                 dynrank > 1 ? layout.dimension[1] : KOKKOS_INVALID_INDEX,
                 dynrank > 2 ? layout.dimension[2] : KOKKOS_INVALID_INDEX,
@@ -205,10 +209,10 @@ KOKKOS_INLINE_FUNCTION static
 
 // LayoutStride
 template <typename Layout, typename iType>
-KOKKOS_INLINE_FUNCTION static typename std::enable_if<
+KOKKOS_INLINE_FUNCTION static std::enable_if_t<
     (std::is_same<Layout, Kokkos::LayoutStride>::value) &&
         std::is_integral<iType>::value,
-    Layout>::type
+    Layout>
 reconstructLayout(const Layout& layout, iType dynrank) {
   return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX,
                 dynrank > 0 ? layout.stride[0] : (0),
@@ -308,26 +312,25 @@ namespace Impl {
 template <class DstTraits, class SrcTraits>
 class ViewMapping<
     DstTraits, SrcTraits,
-    typename std::enable_if<
-        (std::is_same<typename DstTraits::memory_space,
-                      typename SrcTraits::memory_space>::value &&
-         std::is_same<typename DstTraits::specialize, void>::value &&
-         std::is_same<typename SrcTraits::specialize, void>::value &&
-         (std::is_same<typename DstTraits::array_layout,
-                       typename SrcTraits::array_layout>::value ||
-          ((std::is_same<typename DstTraits::array_layout,
-                         Kokkos::LayoutLeft>::value ||
-            std::is_same<typename DstTraits::array_layout,
-                         Kokkos::LayoutRight>::value ||
-            std::is_same<typename DstTraits::array_layout,
-                         Kokkos::LayoutStride>::value) &&
-           (std::is_same<typename SrcTraits::array_layout,
-                         Kokkos::LayoutLeft>::value ||
-            std::is_same<typename SrcTraits::array_layout,
-                         Kokkos::LayoutRight>::value ||
-            std::is_same<typename SrcTraits::array_layout,
-                         Kokkos::LayoutStride>::value)))),
-        Kokkos::Impl::ViewToDynRankViewTag>::type> {
+    std::enable_if_t<(std::is_same<typename DstTraits::memory_space,
+                                   typename SrcTraits::memory_space>::value &&
+                      std::is_void<typename DstTraits::specialize>::value &&
+                      std::is_void<typename SrcTraits::specialize>::value &&
+                      (std::is_same<typename DstTraits::array_layout,
+                                    typename SrcTraits::array_layout>::value ||
+                       ((std::is_same<typename DstTraits::array_layout,
+                                      Kokkos::LayoutLeft>::value ||
+                         std::is_same<typename DstTraits::array_layout,
+                                      Kokkos::LayoutRight>::value ||
+                         std::is_same<typename DstTraits::array_layout,
+                                      Kokkos::LayoutStride>::value) &&
+                        (std::is_same<typename SrcTraits::array_layout,
+                                      Kokkos::LayoutLeft>::value ||
+                         std::is_same<typename SrcTraits::array_layout,
+                                      Kokkos::LayoutRight>::value ||
+                         std::is_same<typename SrcTraits::array_layout,
+                                      Kokkos::LayoutStride>::value)))),
+                     Kokkos::Impl::ViewToDynRankViewTag>> {
  private:
   enum {
     is_assignable_value_type =
@@ -397,7 +400,7 @@ template <class>
 struct is_dyn_rank_view : public std::false_type {};
 
 template <class D, class... P>
-struct is_dyn_rank_view<Kokkos::DynRankView<D, P...> > : public std::true_type {
+struct is_dyn_rank_view<Kokkos::DynRankView<D, P...>> : public std::true_type {
 };
 
 template <typename DataType, class... Properties>
@@ -465,23 +468,20 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   //  enum?
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
-      extent(const iType& r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, size_t>
+  extent(const iType& r) const {
     return m_map.extent(r);
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, int>::type
-      extent_int(const iType& r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, int>
+  extent_int(const iType& r) const {
     return static_cast<int>(m_map.extent(r));
   }
 
-  KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout()
-      const {
-    return m_map.layout();
-  }
+  KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() const;
 
   //----------------------------------------
   /*  Deprecate all 'dimension' functions in favor of
@@ -567,7 +567,7 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
     is_layout_stride = std::is_same<typename traits::array_layout,
                                     Kokkos::LayoutStride>::value,
 
-    is_default_map = std::is_same<typename traits::specialize, void>::value &&
+    is_default_map = std::is_void<typename traits::specialize>::value &&
                      (is_layout_left || is_layout_right || is_layout_stride)
   };
 
@@ -611,11 +611,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // This assumes a contiguous underlying memory (i.e. no padding, no
   // striding...)
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
       std::is_same<typename drvtraits::value_type,
                    typename drvtraits::scalar_array_type>::value &&
           std::is_integral<iType>::value,
-      reference_type>::type
+      reference_type>
   operator[](const iType& i0) const {
     // Phalanx is violating this, since they use the operator to access ALL
     // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 ,
@@ -626,11 +626,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // This assumes a contiguous underlying memory (i.e. no padding, no
   // striding... AND a Trilinos/Sacado scalar type )
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
       !std::is_same<typename drvtraits::value_type,
                     typename drvtraits::scalar_array_type>::value &&
           std::is_integral<iType>::value,
-      reference_type>::type
+      reference_type>
   operator[](const iType& i0) const {
     //      auto map = impl_map();
     const size_t dim_scalar = m_map.dimension_scalar();
@@ -640,60 +640,60 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
         DataType*, typename traits::array_layout, typename traits::device_type,
         Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged |
                              traits::memory_traits::is_random_access |
-                             traits::memory_traits::is_atomic> >;
+                             traits::memory_traits::is_atomic>>;
     tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
     return rankone_view(i0);
   }
 
   // Rank 1 parenthesis
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
-       std::is_integral<iType>::value),
-      reference_type>::type
-  operator()(const iType& i0) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<(std::is_void<typename traits::specialize>::value &&
+                        std::is_integral<iType>::value),
+                       reference_type>
+      operator()(const iType& i0) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
     return m_map.reference(i0);
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename traits::specialize, void>::value &&
-        std::is_integral<iType>::value),
-      reference_type>::type
-  operator()(const iType& i0) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename traits::specialize>::value &&
+                         std::is_integral<iType>::value),
+                       reference_type>
+      operator()(const iType& i0) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
     return m_map.reference(i0, 0, 0, 0, 0, 0, 0);
   }
 
   // Rank 2
   template <typename iType0, typename iType1>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value),
-      reference_type>::type
+      reference_type>
   operator()(const iType0& i0, const iType1& i1) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
     return m_map.reference(i0, i1);
   }
 
   template <typename iType0, typename iType1>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  operator()(const iType0& i0, const iType1& i1) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      operator()(const iType0& i0, const iType1& i1) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
     return m_map.reference(i0, i1, 0, 0, 0, 0, 0);
   }
 
   // Rank 3
   template <typename iType0, typename iType1, typename iType2>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value),
-      reference_type>::type
+      reference_type>
   operator()(const iType0& i0, const iType1& i1, const iType2& i2) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (3, this->rank(), m_track, m_map, i0, i1, i2))
@@ -701,11 +701,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename iType0, typename iType1, typename iType2>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  operator()(const iType0& i0, const iType1& i1, const iType2& i2) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      operator()(const iType0& i0, const iType1& i1, const iType2& i2) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (3, this->rank(), m_track, m_map, i0, i1, i2))
     return m_map.reference(i0, i1, i2, 0, 0, 0, 0);
@@ -713,11 +713,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 
   // Rank 4
   template <typename iType0, typename iType1, typename iType2, typename iType3>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value),
-      reference_type>::type
+      reference_type>
   operator()(const iType0& i0, const iType1& i1, const iType2& i2,
              const iType3& i3) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
@@ -726,12 +726,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename iType0, typename iType1, typename iType2, typename iType3>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
-             const iType3& i3) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      operator()(const iType0& i0, const iType1& i1, const iType2& i2,
+                 const iType3& i3) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (4, this->rank(), m_track, m_map, i0, i1, i2, i3))
     return m_map.reference(i0, i1, i2, i3, 0, 0, 0);
@@ -740,12 +740,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Rank 5
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
        std::is_integral<iType4>::value),
-      reference_type>::type
+      reference_type>
   operator()(const iType0& i0, const iType1& i1, const iType2& i2,
              const iType3& i3, const iType4& i4) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
@@ -755,12 +755,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
-             const iType3& i3, const iType4& i4) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      operator()(const iType0& i0, const iType1& i1, const iType2& i2,
+                 const iType3& i3, const iType4& i4) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4))
     return m_map.reference(i0, i1, i2, i3, i4, 0, 0);
@@ -769,12 +769,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Rank 6
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4, typename iType5>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
        std::is_integral<iType4>::value && std::is_integral<iType5>::value),
-      reference_type>::type
+      reference_type>
   operator()(const iType0& i0, const iType1& i1, const iType2& i2,
              const iType3& i3, const iType4& i4, const iType5& i5) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
@@ -784,12 +784,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4, typename iType5>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
-             const iType3& i3, const iType4& i4, const iType5& i5) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      operator()(const iType0& i0, const iType1& i1, const iType2& i2,
+                 const iType3& i3, const iType4& i4, const iType5& i5) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5))
     return m_map.reference(i0, i1, i2, i3, i4, i5, 0);
@@ -798,12 +798,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Rank 7
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4, typename iType5, typename iType6>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
       (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
        std::is_integral<iType4>::value && std::is_integral<iType5>::value &&
        std::is_integral<iType6>::value),
-      reference_type>::type
+      reference_type>
   operator()(const iType0& i0, const iType1& i1, const iType2& i2,
              const iType3& i3, const iType4& i4, const iType5& i5,
              const iType6& i6) const {
@@ -823,53 +823,53 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Rank 1
   // Rank 1 parenthesis
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
-       std::is_integral<iType>::value),
-      reference_type>::type
-  access(const iType& i0) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<(std::is_void<typename traits::specialize>::value &&
+                        std::is_integral<iType>::value),
+                       reference_type>
+      access(const iType& i0) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
     return m_map.reference(i0);
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename traits::specialize, void>::value &&
-        std::is_integral<iType>::value),
-      reference_type>::type
-  access(const iType& i0) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename traits::specialize>::value &&
+                         std::is_integral<iType>::value),
+                       reference_type>
+      access(const iType& i0) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
     return m_map.reference(i0, 0, 0, 0, 0, 0, 0);
   }
 
   // Rank 2
   template <typename iType0, typename iType1>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value),
-      reference_type>::type
+      reference_type>
   access(const iType0& i0, const iType1& i1) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
     return m_map.reference(i0, i1);
   }
 
   template <typename iType0, typename iType1>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  access(const iType0& i0, const iType1& i1) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      access(const iType0& i0, const iType1& i1) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
     return m_map.reference(i0, i1, 0, 0, 0, 0, 0);
   }
 
   // Rank 3
   template <typename iType0, typename iType1, typename iType2>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value),
-      reference_type>::type
+      reference_type>
   access(const iType0& i0, const iType1& i1, const iType2& i2) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (3, this->rank(), m_track, m_map, i0, i1, i2))
@@ -877,11 +877,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename iType0, typename iType1, typename iType2>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  access(const iType0& i0, const iType1& i1, const iType2& i2) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      access(const iType0& i0, const iType1& i1, const iType2& i2) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (3, this->rank(), m_track, m_map, i0, i1, i2))
     return m_map.reference(i0, i1, i2, 0, 0, 0, 0);
@@ -889,11 +889,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 
   // Rank 4
   template <typename iType0, typename iType1, typename iType2, typename iType3>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value),
-      reference_type>::type
+      reference_type>
   access(const iType0& i0, const iType1& i1, const iType2& i2,
          const iType3& i3) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
@@ -902,12 +902,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename iType0, typename iType1, typename iType2, typename iType3>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  access(const iType0& i0, const iType1& i1, const iType2& i2,
-         const iType3& i3) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      access(const iType0& i0, const iType1& i1, const iType2& i2,
+             const iType3& i3) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (4, this->rank(), m_track, m_map, i0, i1, i2, i3))
     return m_map.reference(i0, i1, i2, i3, 0, 0, 0);
@@ -916,12 +916,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Rank 5
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
        std::is_integral<iType4>::value),
-      reference_type>::type
+      reference_type>
   access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
          const iType4& i4) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
@@ -931,12 +931,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
-         const iType4& i4) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      access(const iType0& i0, const iType1& i1, const iType2& i2,
+             const iType3& i3, const iType4& i4) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4))
     return m_map.reference(i0, i1, i2, i3, i4, 0, 0);
@@ -945,12 +945,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Rank 6
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4, typename iType5>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      (std::is_same<typename traits::specialize, void>::value &&
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
+      (std::is_void<typename traits::specialize>::value &&
        std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
        std::is_integral<iType4>::value && std::is_integral<iType5>::value),
-      reference_type>::type
+      reference_type>
   access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
          const iType4& i4, const iType5& i5) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
@@ -960,12 +960,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4, typename iType5>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
-      !(std::is_same<typename drvtraits::specialize, void>::value &&
-        std::is_integral<iType0>::value),
-      reference_type>::type
-  access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
-         const iType4& i4, const iType5& i5) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value &&
+                         std::is_integral<iType0>::value),
+                       reference_type>
+      access(const iType0& i0, const iType1& i1, const iType2& i2,
+             const iType3& i3, const iType4& i4, const iType5& i5) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
         (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5))
     return m_map.reference(i0, i1, i2, i3, i4, i5, 0);
@@ -974,12 +974,12 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   // Rank 7
   template <typename iType0, typename iType1, typename iType2, typename iType3,
             typename iType4, typename iType5, typename iType6>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
       (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
        std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
        std::is_integral<iType4>::value && std::is_integral<iType5>::value &&
        std::is_integral<iType6>::value),
-      reference_type>::type
+      reference_type>
   access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
          const iType4& i4, const iType5& i5, const iType6& i6) const {
     KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
@@ -1092,9 +1092,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   template <class... P>
   explicit inline DynRankView(
       const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
-                              typename traits::array_layout>::type const&
-          arg_layout)
+      std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
+                       typename traits::array_layout> const& arg_layout)
       : m_track(),
         m_map(),
         m_rank(Impl::DynRankDimTraits<typename traits::specialize>::
@@ -1107,17 +1106,14 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
     // to avoid duplicate class error.
     using alloc_prop = Kokkos::Impl::ViewCtorProp<
         P...,
-        typename std::conditional<alloc_prop_input::has_label,
-                                  std::integral_constant<unsigned, 0>,
-                                  typename std::string>::type,
-        typename std::conditional<
-            alloc_prop_input::has_memory_space,
-            std::integral_constant<unsigned, 1>,
-            typename traits::device_type::memory_space>::type,
-        typename std::conditional<
-            alloc_prop_input::has_execution_space,
-            std::integral_constant<unsigned, 2>,
-            typename traits::device_type::execution_space>::type>;
+        std::conditional_t<alloc_prop_input::has_label,
+                           std::integral_constant<unsigned, 0>, std::string>,
+        std::conditional_t<alloc_prop_input::has_memory_space,
+                           std::integral_constant<unsigned, 1>,
+                           typename traits::device_type::memory_space>,
+        std::conditional_t<alloc_prop_input::has_execution_space,
+                           std::integral_constant<unsigned, 2>,
+                           typename traits::device_type::execution_space>>;
 
     static_assert(traits::is_managed,
                   "View allocation constructor requires managed memory");
@@ -1152,7 +1148,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
     Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
         prop_copy,
         Impl::DynRankDimTraits<typename traits::specialize>::
-            template createLayout<traits, P...>(arg_prop, arg_layout));
+            template createLayout<traits, P...>(arg_prop, arg_layout),
+        Impl::ViewCtorProp<P...>::has_execution_space);
 
 //------------------------------------------------------------
 #if defined(KOKKOS_ENABLE_CUDA)
@@ -1172,9 +1169,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   template <class... P>
   explicit KOKKOS_INLINE_FUNCTION DynRankView(
       const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
-                              typename traits::array_layout>::type const&
-          arg_layout)
+      std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
+                       typename traits::array_layout> const& arg_layout)
       : m_track()  // No memory tracking
         ,
         m_map(arg_prop,
@@ -1197,15 +1193,15 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   template <class... P>
   explicit inline DynRankView(
       const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
-                              size_t>::type const arg_N0 = KOKKOS_INVALID_INDEX,
-      const size_t arg_N1                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N2                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N3                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N4                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N5                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N6                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N7                                = KOKKOS_INVALID_INDEX)
+      std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
+                       size_t> const arg_N0 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N1                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N2                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N3                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N4                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N5                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N6                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N7                   = KOKKOS_INVALID_INDEX)
       : DynRankView(arg_prop, typename traits::array_layout(
                                   arg_N0, arg_N1, arg_N2, arg_N3, arg_N4,
                                   arg_N5, arg_N6, arg_N7)) {}
@@ -1213,15 +1209,15 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   template <class... P>
   explicit KOKKOS_INLINE_FUNCTION DynRankView(
       const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
-                              size_t>::type const arg_N0 = KOKKOS_INVALID_INDEX,
-      const size_t arg_N1                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N2                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N3                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N4                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N5                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N6                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N7                                = KOKKOS_INVALID_INDEX)
+      std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
+                       size_t> const arg_N0 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N1                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N2                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N3                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N4                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N5                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N6                   = KOKKOS_INVALID_INDEX,
+      const size_t arg_N7                   = KOKKOS_INVALID_INDEX)
       : DynRankView(arg_prop, typename traits::array_layout(
                                   arg_N0, arg_N1, arg_N2, arg_N3, arg_N4,
                                   arg_N5, arg_N6, arg_N7)) {}
@@ -1230,9 +1226,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   template <typename Label>
   explicit inline DynRankView(
       const Label& arg_label,
-      typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value,
-                              typename traits::array_layout>::type const&
-          arg_layout)
+      std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value,
+                       typename traits::array_layout> const& arg_layout)
       : DynRankView(Kokkos::Impl::ViewCtorProp<std::string>(arg_label),
                     arg_layout) {}
 
@@ -1240,15 +1235,15 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   template <typename Label>
   explicit inline DynRankView(
       const Label& arg_label,
-      typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value,
-                              const size_t>::type arg_N0 = KOKKOS_INVALID_INDEX,
-      const size_t arg_N1                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N2                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N3                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N4                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N5                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N6                                = KOKKOS_INVALID_INDEX,
-      const size_t arg_N7                                = KOKKOS_INVALID_INDEX)
+      std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value, const size_t>
+          arg_N0          = KOKKOS_INVALID_INDEX,
+      const size_t arg_N1 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N2 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N3 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N4 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N5 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N6 = KOKKOS_INVALID_INDEX,
+      const size_t arg_N7 = KOKKOS_INVALID_INDEX)
       : DynRankView(
             Kokkos::Impl::ViewCtorProp<std::string>(arg_label),
             typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
@@ -1298,7 +1293,7 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
         (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) +
         (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX);
 
-    if (std::is_same<typename traits::specialize, void>::value &&
+    if (std::is_void<typename traits::specialize>::value &&
         num_passed_args != traits::rank_dynamic) {
       Kokkos::abort(
           "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n");
@@ -1365,15 +1360,14 @@ namespace Impl {
 
 template <class SrcTraits, class... Args>
 class ViewMapping<
-    typename std::enable_if<
-        (std::is_same<typename SrcTraits::specialize, void>::value &&
-         (std::is_same<typename SrcTraits::array_layout,
-                       Kokkos::LayoutLeft>::value ||
-          std::is_same<typename SrcTraits::array_layout,
-                       Kokkos::LayoutRight>::value ||
-          std::is_same<typename SrcTraits::array_layout,
-                       Kokkos::LayoutStride>::value)),
-        Kokkos::Impl::DynRankSubviewTag>::type,
+    std::enable_if_t<(std::is_void<typename SrcTraits::specialize>::value &&
+                      (std::is_same<typename SrcTraits::array_layout,
+                                    Kokkos::LayoutLeft>::value ||
+                       std::is_same<typename SrcTraits::array_layout,
+                                    Kokkos::LayoutRight>::value ||
+                       std::is_same<typename SrcTraits::array_layout,
+                                    Kokkos::LayoutStride>::value)),
+                     Kokkos::Impl::DynRankSubviewTag>,
     SrcTraits, Args...> {
  private:
   enum {
@@ -1445,22 +1439,21 @@ class ViewMapping<
       Args... args) {
     using DstType = ViewMapping<traits_type, typename traits_type::specialize>;
 
-    using DstDimType = typename std::conditional<
+    using DstDimType = std::conditional_t<
         (rank == 0), ViewDimension<>,
-        typename std::conditional<
+        std::conditional_t<
             (rank == 1), ViewDimension<0>,
-            typename std::conditional<
+            std::conditional_t<
                 (rank == 2), ViewDimension<0, 0>,
-                typename std::conditional<
+                std::conditional_t<
                     (rank == 3), ViewDimension<0, 0, 0>,
-                    typename std::conditional<
+                    std::conditional_t<
                         (rank == 4), ViewDimension<0, 0, 0, 0>,
-                        typename std::conditional<
+                        std::conditional_t<
                             (rank == 5), ViewDimension<0, 0, 0, 0, 0>,
-                            typename std::conditional<
+                            std::conditional_t<
                                 (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>,
-                                ViewDimension<0, 0, 0, 0, 0, 0, 0> >::type>::
-                            type>::type>::type>::type>::type>::type;
+                                ViewDimension<0, 0, 0, 0, 0, 0, 0>>>>>>>>;
 
     using dst_offset_type = ViewOffset<DstDimType, Kokkos::LayoutStride>;
     using dst_handle_type = typename DstType::handle_type;
@@ -1621,8 +1614,7 @@ struct DynRankViewFill {
 };
 
 template <class OutputView>
-struct DynRankViewFill<OutputView,
-                       typename std::enable_if<OutputView::Rank == 0>::type> {
+struct DynRankViewFill<OutputView, std::enable_if_t<OutputView::Rank == 0>> {
   DynRankViewFill(const OutputView& dst,
                   const typename OutputView::const_value_type& src) {
     Kokkos::Impl::DeepCopy<typename OutputView::memory_space,
@@ -1645,6 +1637,24 @@ struct DynRankViewRemap {
   const size_t n6;
   const size_t n7;
 
+  DynRankViewRemap(const ExecSpace& exec_space, const OutputView& arg_out,
+                   const InputView& arg_in)
+      : output(arg_out),
+        input(arg_in),
+        n0(std::min((size_t)arg_out.extent(0), (size_t)arg_in.extent(0))),
+        n1(std::min((size_t)arg_out.extent(1), (size_t)arg_in.extent(1))),
+        n2(std::min((size_t)arg_out.extent(2), (size_t)arg_in.extent(2))),
+        n3(std::min((size_t)arg_out.extent(3), (size_t)arg_in.extent(3))),
+        n4(std::min((size_t)arg_out.extent(4), (size_t)arg_in.extent(4))),
+        n5(std::min((size_t)arg_out.extent(5), (size_t)arg_in.extent(5))),
+        n6(std::min((size_t)arg_out.extent(6), (size_t)arg_in.extent(6))),
+        n7(std::min((size_t)arg_out.extent(7), (size_t)arg_in.extent(7))) {
+    using Policy = Kokkos::RangePolicy<ExecSpace>;
+
+    Kokkos::parallel_for("Kokkos::DynRankViewRemap", Policy(exec_space, 0, n0),
+                         *this);
+  }
+
   DynRankViewRemap(const OutputView& arg_out, const InputView& arg_in)
       : output(arg_out),
         input(arg_in),
@@ -1691,14 +1701,19 @@ namespace Impl {
    underlying memory, to facilitate implementation of deep_copy() and
    other routines that are defined on View */
 template <unsigned N, typename T, typename... Args>
-auto as_view_of_rank_n(DynRankView<T, Args...> v) {
+KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) {
   if (v.rank() != N) {
-    Kokkos::Impl::throw_runtime_exception(
-        "Converting DynRankView of rank " + std::to_string(v.rank()) +
-        " to a View of mis-matched rank " + std::to_string(N));
+    KOKKOS_IF_ON_HOST(
+        const std::string message =
+            "Converting DynRankView of rank " + std::to_string(v.rank()) +
+            " to a View of mis-matched rank " + std::to_string(N) + "!";
+        Kokkos::abort(message.c_str());)
+    KOKKOS_IF_ON_DEVICE(
+        Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");)
   }
 
-  return View<typename RankDataType<T, N>::type, Args...>(v.data(), v.layout());
+  return View<typename RankDataType<T, N>::type, Args...>(
+      v.data(), v.impl_map().layout());
 }
 
 template <typename Function, typename... Args>
@@ -1713,22 +1728,54 @@ void apply_to_view_of_static_rank(Function&& f, DynRankView<Args...> a) {
     case 6: f(as_view_of_rank_n<6>(a)); break;
     case 7: f(as_view_of_rank_n<7>(a)); break;
     default:
-      Kokkos::Impl::throw_runtime_exception(
-          "Trying to apply a function to a view of unexpected rank " +
-          std::to_string(rank(a)));
+      KOKKOS_IF_ON_HOST(
+          Kokkos::abort(
+              std::string(
+                  "Trying to apply a function to a view of unexpected rank " +
+                  std::to_string(rank(a)))
+                  .c_str());)
+      KOKKOS_IF_ON_DEVICE(
+          Kokkos::abort(
+              "Trying to apply a function to a view of unexpected rank");)
   }
 }
 
 }  // namespace Impl
 
+template <typename D, class... P>
+KOKKOS_INLINE_FUNCTION constexpr auto DynRankView<D, P...>::layout() const ->
+    typename traits::array_layout {
+  switch (rank()) {
+    case 0: return Impl::as_view_of_rank_n<0>(*this).layout();
+    case 1: return Impl::as_view_of_rank_n<1>(*this).layout();
+    case 2: return Impl::as_view_of_rank_n<2>(*this).layout();
+    case 3: return Impl::as_view_of_rank_n<3>(*this).layout();
+    case 4: return Impl::as_view_of_rank_n<4>(*this).layout();
+    case 5: return Impl::as_view_of_rank_n<5>(*this).layout();
+    case 6: return Impl::as_view_of_rank_n<6>(*this).layout();
+    case 7: return Impl::as_view_of_rank_n<7>(*this).layout();
+    default:
+      KOKKOS_IF_ON_HOST(
+          Kokkos::abort(
+              std::string(
+                  "Calling DynRankView::layout on DRV of unexpected rank " +
+                  std::to_string(rank()))
+                  .c_str());)
+      KOKKOS_IF_ON_DEVICE(
+          Kokkos::abort(
+              "Calling DynRankView::layout on DRV of unexpected rank");)
+  }
+  // control flow should never reach here
+  return m_map.layout();
+}
+
 /** \brief  Deep copy a value from Host memory into a view.  */
 template <class ExecSpace, class DT, class... DP>
 inline void deep_copy(
     const ExecSpace& e, const DynRankView<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   static_assert(
       std::is_same<typename ViewTraits<DT, DP...>::non_const_value_type,
                    typename ViewTraits<DT, DP...>::value_type>::value,
@@ -1742,9 +1789,8 @@ template <class DT, class... DP>
 inline void deep_copy(
     const DynRankView<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   Impl::apply_to_view_of_static_rank([=](auto view) { deep_copy(view, value); },
                                      dst);
 }
@@ -1755,8 +1801,8 @@ inline void deep_copy(
     const ExecSpace& e,
     typename ViewTraits<ST, SP...>::non_const_value_type& dst,
     const DynRankView<ST, SP...>& src,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<ST, SP...>::specialize, void>::value>::type* = 0) {
+    std::enable_if_t<std::is_same<typename ViewTraits<ST, SP...>::specialize,
+                                  void>::value>* = 0) {
   deep_copy(e, dst, Impl::as_view_of_rank_n<0>(src));
 }
 
@@ -1764,8 +1810,8 @@ template <class ST, class... SP>
 inline void deep_copy(
     typename ViewTraits<ST, SP...>::non_const_value_type& dst,
     const DynRankView<ST, SP...>& src,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<ST, SP...>::specialize, void>::value>::type* = 0) {
+    std::enable_if_t<std::is_same<typename ViewTraits<ST, SP...>::specialize,
+                                  void>::value>* = 0) {
   deep_copy(dst, Impl::as_view_of_rank_n<0>(src));
 }
 
@@ -1778,11 +1824,11 @@ inline void deep_copy(
 template <class ExecSpace, class DstType, class SrcType>
 inline void deep_copy(
     const ExecSpace& exec_space, const DstType& dst, const SrcType& src,
-    typename std::enable_if<
-        (std::is_same<typename DstType::traits::specialize, void>::value &&
-         std::is_same<typename SrcType::traits::specialize, void>::value &&
+    std::enable_if_t<
+        (std::is_void<typename DstType::traits::specialize>::value &&
+         std::is_void<typename SrcType::traits::specialize>::value &&
          (Kokkos::is_dyn_rank_view<DstType>::value ||
-          Kokkos::is_dyn_rank_view<SrcType>::value))>::type* = nullptr) {
+          Kokkos::is_dyn_rank_view<SrcType>::value))>* = nullptr) {
   static_assert(
       std::is_same<typename DstType::traits::value_type,
                    typename DstType::traits::non_const_value_type>::value,
@@ -1831,11 +1877,11 @@ inline void deep_copy(
 template <class DstType, class SrcType>
 inline void deep_copy(
     const DstType& dst, const SrcType& src,
-    typename std::enable_if<
-        (std::is_same<typename DstType::traits::specialize, void>::value &&
-         std::is_same<typename SrcType::traits::specialize, void>::value &&
+    std::enable_if_t<
+        (std::is_void<typename DstType::traits::specialize>::value &&
+         std::is_void<typename SrcType::traits::specialize>::value &&
          (Kokkos::is_dyn_rank_view<DstType>::value ||
-          Kokkos::is_dyn_rank_view<SrcType>::value))>::type* = nullptr) {
+          Kokkos::is_dyn_rank_view<SrcType>::value))>* = nullptr) {
   static_assert(
       std::is_same<typename DstType::traits::value_type,
                    typename DstType::traits::non_const_value_type>::value,
@@ -1910,8 +1956,8 @@ struct MirrorDRViewType {
   using dest_view_type = Kokkos::DynRankView<data_type, array_layout, Space>;
   // If it is the same memory_space return the existsing view_type
   // This will also keep the unmanaged trait if necessary
-  using view_type = typename std::conditional<is_same_memspace, src_view_type,
-                                              dest_view_type>::type;
+  using view_type =
+      std::conditional_t<is_same_memspace, src_view_type, dest_view_type>;
 };
 
 template <class Space, class T, class... P>
@@ -1936,124 +1982,352 @@ struct MirrorDRVType {
 
 }  // namespace Impl
 
-template <class T, class... P>
+namespace Impl {
+template <class T, class... P, class... ViewCtorArgs>
 inline typename DynRankView<T, P...>::HostMirror create_mirror(
     const DynRankView<T, P...>& src,
-    typename std::enable_if<
-        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
-        !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
-                      Kokkos::LayoutStride>::value>::type* = nullptr) {
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    std::enable_if_t<!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* =
+        nullptr) {
   using src_type = DynRankView<T, P...>;
   using dst_type = typename src_type::HostMirror;
 
-  return dst_type(std::string(src.label()).append("_mirror"),
-                  Impl::reconstructLayout(src.layout(), src.rank()));
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
+
+  return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank()));
 }
 
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror(
+    const DynRankView<T, P...>& src,
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    std::enable_if_t<Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* =
+        nullptr) {
+  using dst_type = typename Impl::MirrorDRVType<
+      typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+      P...>::view_type;
+
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
+
+  return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank()));
+}
+
+}  // namespace Impl
+
+// Create a mirror in host space
 template <class T, class... P>
 inline typename DynRankView<T, P...>::HostMirror create_mirror(
     const DynRankView<T, P...>& src,
-    typename std::enable_if<
-        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
-        std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
-                     Kokkos::LayoutStride>::value>::type* = 0) {
-  using src_type = DynRankView<T, P...>;
-  using dst_type = typename src_type::HostMirror;
+    std::enable_if_t<std::is_same<typename ViewTraits<T, P...>::specialize,
+                                  void>::value>* = nullptr) {
+  return Impl::create_mirror(src, Kokkos::Impl::ViewCtorProp<>{});
+}
 
-  return dst_type(std::string(src.label()).append("_mirror"),
-                  Impl::reconstructLayout(src.layout(), src.rank()));
+template <class T, class... P>
+inline typename DynRankView<T, P...>::HostMirror create_mirror(
+    Kokkos::Impl::WithoutInitializing_t wi, const DynRankView<T, P...>& src,
+    std::enable_if_t<std::is_same<typename ViewTraits<T, P...>::specialize,
+                                  void>::value>* = nullptr) {
+  return Impl::create_mirror(src, Kokkos::view_alloc(wi));
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline typename DynRankView<T, P...>::HostMirror create_mirror(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const DynRankView<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = nullptr) {
+  return Impl::create_mirror(src, arg_prop);
+}
+
+// Create a mirror in a new space
+template <class Space, class T, class... P,
+          typename Enable = std::enable_if_t<
+              Kokkos::is_space<Space>::value &&
+              std::is_void<typename ViewTraits<T, P...>::specialize>::value>>
+typename Impl::MirrorDRVType<Space, T, P...>::view_type create_mirror(
+    const Space&, const Kokkos::DynRankView<T, P...>& src) {
+  return Impl::create_mirror(
+      src, Kokkos::view_alloc(typename Space::memory_space{}));
 }
 
-// Create a mirror in a new space (specialization for different space)
 template <class Space, class T, class... P>
 typename Impl::MirrorDRVType<Space, T, P...>::view_type create_mirror(
+    Kokkos::Impl::WithoutInitializing_t wi, const Space&,
+    const Kokkos::DynRankView<T, P...>& src,
+    std::enable_if_t<std::is_same<typename ViewTraits<T, P...>::specialize,
+                                  void>::value>* = nullptr) {
+  return Impl::create_mirror(
+      src, Kokkos::view_alloc(wi, typename Space::memory_space{}));
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const DynRankView<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = nullptr) {
+  using ReturnType = typename Impl::MirrorDRVType<
+      typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+      P...>::view_type;
+  return ReturnType{Impl::create_mirror(src, arg_prop)};
+}
+
+namespace Impl {
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    std::is_same<
+        typename DynRankView<T, P...>::memory_space,
+        typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
+        std::is_same<
+            typename DynRankView<T, P...>::data_type,
+            typename DynRankView<T, P...>::HostMirror::data_type>::value,
+    typename DynRankView<T, P...>::HostMirror>
+create_mirror_view(const DynRankView<T, P...>& src,
+                   const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
+  return src;
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    !(std::is_same<
+          typename DynRankView<T, P...>::memory_space,
+          typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
+      std::is_same<
+          typename DynRankView<T, P...>::data_type,
+          typename DynRankView<T, P...>::HostMirror::data_type>::value),
+    typename DynRankView<T, P...>::HostMirror>
+create_mirror_view(
+    const DynRankView<T, P...>& src,
+    const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return Kokkos::Impl::create_mirror(src, arg_prop);
+}
+
+template <class Space, class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    Kokkos::is_space<Space>::value &&
+        Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
+    typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
+create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src,
+                   const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
+  return src;
+}
+
+template <class Space, class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    Kokkos::is_space<Space>::value &&
+        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
+    typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
+create_mirror_view(
     const Space&, const Kokkos::DynRankView<T, P...>& src,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<T, P...>::specialize, void>::value>::type* =
-        nullptr) {
-  return typename Impl::MirrorDRVType<Space, T, P...>::view_type(
-      src.label(), Impl::reconstructLayout(src.layout(), src.rank()));
+    const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using MemorySpace = typename Space::memory_space;
+  using alloc_prop  = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
+  alloc_prop prop_copy(arg_prop);
+
+  return Kokkos::Impl::create_mirror(src, prop_copy);
 }
+}  // namespace Impl
 
+// Create a mirror view in host space
 template <class T, class... P>
-inline typename DynRankView<T, P...>::HostMirror create_mirror_view(
-    const DynRankView<T, P...>& src,
-    typename std::enable_if<
-        (std::is_same<
-             typename DynRankView<T, P...>::memory_space,
-             typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
-         std::is_same<typename DynRankView<T, P...>::data_type,
-                      typename DynRankView<T, P...>::HostMirror::data_type>::
-             value)>::type* = nullptr) {
+inline std::enable_if_t<
+    (std::is_same<
+         typename DynRankView<T, P...>::memory_space,
+         typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
+     std::is_same<typename DynRankView<T, P...>::data_type,
+                  typename DynRankView<T, P...>::HostMirror::data_type>::value),
+    typename DynRankView<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::DynRankView<T, P...>& src) {
   return src;
 }
 
 template <class T, class... P>
-inline typename DynRankView<T, P...>::HostMirror create_mirror_view(
-    const DynRankView<T, P...>& src,
-    typename std::enable_if<
-        !(std::is_same<
-              typename DynRankView<T, P...>::memory_space,
-              typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
-          std::is_same<typename DynRankView<T, P...>::data_type,
-                       typename DynRankView<T, P...>::HostMirror::data_type>::
-              value)>::type* = nullptr) {
+inline std::enable_if_t<
+    !(std::is_same<
+          typename DynRankView<T, P...>::memory_space,
+          typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
+      std::is_same<
+          typename DynRankView<T, P...>::data_type,
+          typename DynRankView<T, P...>::HostMirror::data_type>::value),
+    typename DynRankView<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::DynRankView<T, P...>& src) {
   return Kokkos::create_mirror(src);
 }
 
-// Create a mirror view in a new space (specialization for same space)
-template <class Space, class T, class... P>
-typename Impl::MirrorDRViewType<Space, T, P...>::view_type create_mirror_view(
+template <class T, class... P>
+inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
+                               const DynRankView<T, P...>& src) {
+  return Impl::create_mirror_view(src, Kokkos::view_alloc(wi));
+}
+
+// Create a mirror view in a new space
+// FIXME_C++17 Improve SFINAE here.
+template <class Space, class T, class... P,
+          class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
+inline typename Impl::MirrorDRViewType<Space, T, P...>::view_type
+create_mirror_view(
     const Space&, const Kokkos::DynRankView<T, P...>& src,
-    typename std::enable_if<
-        Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
-        nullptr) {
+    std::enable_if_t<
+        Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>* = nullptr) {
   return src;
 }
 
-// Create a mirror view in a new space (specialization for different space)
-template <class Space, class T, class... P>
-typename Impl::MirrorDRViewType<Space, T, P...>::view_type create_mirror_view(
-    const Space&, const Kokkos::DynRankView<T, P...>& src,
-    typename std::enable_if<
-        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
-        nullptr) {
-  return typename Impl::MirrorDRViewType<Space, T, P...>::view_type(
-      src.label(), Impl::reconstructLayout(src.layout(), src.rank()));
+// FIXME_C++17 Improve SFINAE here.
+template <class Space, class T, class... P,
+          class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
+inline typename Impl::MirrorDRViewType<Space, T, P...>::view_type
+create_mirror_view(
+    const Space& space, const Kokkos::DynRankView<T, P...>& src,
+    std::enable_if_t<
+        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>* = nullptr) {
+  return Kokkos::create_mirror(space, src);
 }
 
-// Create a mirror view and deep_copy in a new space (specialization for same
-// space)
 template <class Space, class T, class... P>
-typename Impl::MirrorDRViewType<Space, T, P...>::view_type
-create_mirror_view_and_copy(
-    const Space&, const Kokkos::DynRankView<T, P...>& src,
-    std::string const& name = "",
-    typename std::enable_if<
-        Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
-        nullptr) {
-  (void)name;
+inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
+                               const Space& space,
+                               const Kokkos::DynRankView<T, P...>& src) {
+  return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror_view(
+    const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::DynRankView<T, P...>& src) {
+  return Impl::create_mirror_view(src, arg_prop);
+}
+
+template <class... ViewCtorArgs, class T, class... P>
+auto create_mirror_view_and_copy(
+    const Impl::ViewCtorProp<ViewCtorArgs...>&,
+    const Kokkos::DynRankView<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        Impl::MirrorDRViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace>* = nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  static_assert(
+      alloc_prop_input::has_memory_space,
+      "The view constructor arguments passed to "
+      "Kokkos::create_mirror_view_and_copy must include a memory space!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::allow_padding,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not explicitly allow padding!");
+
+  // same behavior as deep_copy(src, src)
+  if (!alloc_prop_input::has_execution_space)
+    fence(
+        "Kokkos::create_mirror_view_and_copy: fence before returning src view");
   return src;
 }
 
-// Create a mirror view and deep_copy in a new space (specialization for
-// different space)
-template <class Space, class T, class... P>
-typename Impl::MirrorDRViewType<Space, T, P...>::view_type
-create_mirror_view_and_copy(
-    const Space&, const Kokkos::DynRankView<T, P...>& src,
-    std::string const& name = "",
-    typename std::enable_if<
-        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
-        nullptr) {
+template <class... ViewCtorArgs, class T, class... P>
+auto create_mirror_view_and_copy(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::DynRankView<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        !Impl::MirrorDRViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace>* = nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  static_assert(
+      alloc_prop_input::has_memory_space,
+      "The view constructor arguments passed to "
+      "Kokkos::create_mirror_view_and_copy must include a memory space!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::allow_padding,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not explicitly allow padding!");
+  using Space  = typename alloc_prop_input::memory_space;
   using Mirror = typename Impl::MirrorDRViewType<Space, T, P...>::view_type;
-  std::string label = name.empty() ? src.label() : name;
-  auto mirror       = Mirror(view_alloc(WithoutInitializing, label),
-                       Impl::reconstructLayout(src.layout(), src.rank()));
-  deep_copy(mirror, src);
+
+  // Add some properties if not provided to avoid need for if constexpr
+  using alloc_prop = Impl::ViewCtorProp<
+      ViewCtorArgs...,
+      std::conditional_t<alloc_prop_input::has_label,
+                         std::integral_constant<unsigned int, 12>, std::string>,
+      std::conditional_t<!alloc_prop_input::initialize,
+                         std::integral_constant<unsigned int, 13>,
+                         Impl::WithoutInitializing_t>,
+      std::conditional_t<alloc_prop_input::has_execution_space,
+                         std::integral_constant<unsigned int, 14>,
+                         typename Space::execution_space>>;
+  alloc_prop arg_prop_copy(arg_prop);
+
+  std::string& label =
+      static_cast<Impl::ViewCtorProp<void, std::string>&>(arg_prop_copy).value;
+  if (label.empty()) label = src.label();
+  auto mirror = typename Mirror::non_const_type{
+      arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())};
+  if (alloc_prop_input::has_execution_space) {
+    using ExecutionSpace = typename alloc_prop::execution_space;
+    deep_copy(
+        static_cast<Impl::ViewCtorProp<void, ExecutionSpace>&>(arg_prop_copy)
+            .value,
+        mirror, src);
+  } else
+    deep_copy(mirror, src);
   return mirror;
 }
 
+template <class Space, class T, class... P>
+auto create_mirror_view_and_copy(const Space&,
+                                 const Kokkos::DynRankView<T, P...>& src,
+                                 std::string const& name = "") {
+  return create_mirror_view_and_copy(
+      Kokkos::view_alloc(typename Space::memory_space{}, name), src);
+}
+
 }  // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -2062,20 +2336,47 @@ create_mirror_view_and_copy(
 namespace Kokkos {
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
-template <class... I, class T, class... P>
-inline void impl_resize(DynRankView<T, P...>& v, const size_t n0,
+template <class... ViewCtorArgs, class T, class... P>
+inline void impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+                        DynRankView<T, P...>& v, const size_t n0,
                         const size_t n1, const size_t n2, const size_t n3,
                         const size_t n4, const size_t n5, const size_t n6,
-                        const size_t n7, const I&... arg_prop) {
-  using drview_type = DynRankView<T, P...>;
+                        const size_t n7) {
+  using drview_type      = DynRankView<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
-
-  drview_type v_resized(view_alloc(v.label(), arg_prop...), n0, n1, n2, n3, n4,
-                        n5, n6, n7);
-
-  Kokkos::Impl::DynRankViewRemap<drview_type, drview_type>(v_resized, v);
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::resize "
+                "must not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a memory space instance!");
+
+  // Add execution space here to avoid the need for if constexpr below
+  using alloc_prop = Impl::ViewCtorProp<
+      ViewCtorArgs..., std::string,
+      std::conditional_t<alloc_prop_input::has_execution_space,
+                         std::integral_constant<unsigned int, 10>,
+                         typename drview_type::execution_space>>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      v.label();
+
+  drview_type v_resized(prop_copy, n0, n1, n2, n3, n4, n5, n6, n7);
+
+  if (alloc_prop_input::has_execution_space)
+    Kokkos::Impl::DynRankViewRemap<drview_type, drview_type>(
+        static_cast<const Impl::ViewCtorProp<
+            void, typename alloc_prop::execution_space>&>(prop_copy)
+            .value,
+        v_resized, v);
+  else
+    Kokkos::Impl::DynRankViewRemap<drview_type, drview_type>(v_resized, v);
 
   v = v_resized;
 }
@@ -2090,7 +2391,21 @@ inline void resize(DynRankView<T, P...>& v,
                    const size_t n5 = KOKKOS_INVALID_INDEX,
                    const size_t n6 = KOKKOS_INVALID_INDEX,
                    const size_t n7 = KOKKOS_INVALID_INDEX) {
-  impl_resize(v, n0, n1, n2, n3, n4, n5, n6, n7);
+  impl_resize(Impl::ViewCtorProp<>{}, v, n0, n1, n2, n3, n4, n5, n6, n7);
+}
+
+template <class... ViewCtorArgs, class T, class... P>
+void resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+            DynRankView<T, P...>& v,
+            const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+            const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+            const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+            const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+            const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+            const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+            const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+            const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+  impl_resize(arg_prop, v, n0, n1, n2, n3, n4, n5, n6, n7);
 }
 
 template <class I, class T, class... P>
@@ -2104,26 +2419,53 @@ inline std::enable_if_t<Impl::is_view_ctor_property<I>::value> resize(
     const size_t n5 = KOKKOS_INVALID_INDEX,
     const size_t n6 = KOKKOS_INVALID_INDEX,
     const size_t n7 = KOKKOS_INVALID_INDEX) {
-  impl_resize(v, n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
+  impl_resize(Kokkos::view_alloc(arg_prop), v, n0, n1, n2, n3, n4, n5, n6, n7);
 }
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
-template <class... I, class T, class... P>
+template <class... ViewCtorArgs, class T, class... P>
 inline void impl_realloc(DynRankView<T, P...>& v, const size_t n0,
                          const size_t n1, const size_t n2, const size_t n3,
                          const size_t n4, const size_t n5, const size_t n6,
-                         const size_t n7, const I&... arg_prop) {
-  using drview_type = DynRankView<T, P...>;
+                         const size_t n7,
+                         const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using drview_type      = DynRankView<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only realloc managed views");
-
-  const std::string label = v.label();
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a memory space instance!");
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop arg_prop_copy(arg_prop);
+  static_cast<Kokkos::Impl::ViewCtorProp<void, std::string>&>(arg_prop_copy)
+      .value = v.label();
 
   v = drview_type();  // Deallocate first, if the only view to allocation
-  v = drview_type(view_alloc(label, arg_prop...), n0, n1, n2, n3, n4, n5, n6,
-                  n7);
+  v = drview_type(arg_prop_copy, n0, n1, n2, n3, n4, n5, n6, n7);
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline void realloc(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+                    DynRankView<T, P...>& v,
+                    const size_t n0 = KOKKOS_INVALID_INDEX,
+                    const size_t n1 = KOKKOS_INVALID_INDEX,
+                    const size_t n2 = KOKKOS_INVALID_INDEX,
+                    const size_t n3 = KOKKOS_INVALID_INDEX,
+                    const size_t n4 = KOKKOS_INVALID_INDEX,
+                    const size_t n5 = KOKKOS_INVALID_INDEX,
+                    const size_t n6 = KOKKOS_INVALID_INDEX,
+                    const size_t n7 = KOKKOS_INVALID_INDEX) {
+  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
 }
 
 template <class T, class... P>
@@ -2136,7 +2478,7 @@ inline void realloc(DynRankView<T, P...>& v,
                     const size_t n5 = KOKKOS_INVALID_INDEX,
                     const size_t n6 = KOKKOS_INVALID_INDEX,
                     const size_t n7 = KOKKOS_INVALID_INDEX) {
-  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7);
+  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, Impl::ViewCtorProp<>{});
 }
 
 template <class I, class T, class... P>
@@ -2150,9 +2492,13 @@ inline std::enable_if_t<Impl::is_view_ctor_property<I>::value> realloc(
     const size_t n5 = KOKKOS_INVALID_INDEX,
     const size_t n6 = KOKKOS_INVALID_INDEX,
     const size_t n7 = KOKKOS_INVALID_INDEX) {
-  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
+  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, Kokkos::view_alloc(arg_prop));
 }
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DYNRANKVIEW
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DYNRANKVIEW
+#endif
 #endif
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
index 91904d7cc..015a75cb0 100644
--- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_DYNAMIC_VIEW_HPP
 #define KOKKOS_DYNAMIC_VIEW_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DYNAMICVIEW
+#endif
 
 #include <cstdio>
 
@@ -118,8 +122,7 @@ struct ChunkedArrayManager {
   template <typename Space>
   static ChunkedArrayManager<Space, ValueType> create_mirror(
       ChunkedArrayManager<MemorySpace, ValueType> const& other,
-      typename std::enable_if<IsAccessibleFrom<Space>::value>::type* =
-          nullptr) {
+      std::enable_if_t<IsAccessibleFrom<Space>::value>* = nullptr) {
     return ChunkedArrayManager<Space, ValueType>{
         ACCESSIBLE_TAG{}, other.m_chunks, other.m_chunk_max};
   }
@@ -127,8 +130,7 @@ struct ChunkedArrayManager {
   template <typename Space>
   static ChunkedArrayManager<Space, ValueType> create_mirror(
       ChunkedArrayManager<MemorySpace, ValueType> const& other,
-      typename std::enable_if<!IsAccessibleFrom<Space>::value>::type* =
-          nullptr) {
+      std::enable_if_t<!IsAccessibleFrom<Space>::value>* = nullptr) {
     using tag_type =
         typename ChunkedArrayManager<Space, ValueType>::INACCESSIBLE_TAG;
     return ChunkedArrayManager<Space, ValueType>{tag_type{}, other.m_chunk_max,
@@ -217,17 +219,15 @@ struct ChunkedArrayManager {
 
   pointer_type* get_ptr() const { return m_chunks; }
 
-  template <typename Space>
-  typename std::enable_if<!IsAccessibleFrom<Space>::value>::type deep_copy_to(
-      ChunkedArrayManager<Space, ValueType> const& other) {
-    Kokkos::Impl::DeepCopy<Space, MemorySpace>(
-        other.m_chunks, m_chunks, sizeof(pointer_type) * (m_chunk_max + 2));
-  }
-
-  template <typename Space>
-  typename std::enable_if<IsAccessibleFrom<Space>::value>::type deep_copy_to(
-      ChunkedArrayManager<Space, ValueType> const&) {
-    // no-op
+  template <typename OtherMemorySpace, typename ExecutionSpace>
+  void deep_copy_to(
+      const ExecutionSpace& exec_space,
+      ChunkedArrayManager<OtherMemorySpace, ValueType> const& other) const {
+    if (other.m_chunks != m_chunks) {
+      Kokkos::Impl::DeepCopy<OtherMemorySpace, MemorySpace, ExecutionSpace>(
+          exec_space, other.m_chunks, m_chunks,
+          sizeof(pointer_type) * (m_chunk_max + 2));
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -278,7 +278,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 
   // It is assumed that the value_type is trivially copyable;
   // when this is not the case, potential problems can occur.
-  static_assert(std::is_same<typename traits::specialize, void>::value,
+  static_assert(std::is_void<typename traits::specialize>::value,
                 "DynamicView only implemented for non-specialized View type");
 
  private:
@@ -339,6 +339,9 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   KOKKOS_INLINE_FUNCTION
   size_t chunk_size() const noexcept { return m_chunk_size; }
 
+  KOKKOS_INLINE_FUNCTION
+  size_t chunk_max() const noexcept { return m_chunk_max; }
+
   KOKKOS_INLINE_FUNCTION
   size_t size() const noexcept {
     size_t extent_0 =
@@ -411,33 +414,14 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
         "space");
 
     // Which chunk is being indexed.
-    const uintptr_t ic = uintptr_t(i0 >> m_chunk_shift);
-
-    typename traits::value_type* volatile* const ch = m_chunks + ic;
+    const uintptr_t ic = uintptr_t(i0) >> m_chunk_shift;
 
-    // Do bounds checking if enabled or if the chunk pointer is zero.
-    // If not bounds checking then we assume a non-zero pointer is valid.
-
-#if !defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-    if (nullptr == *ch)
+#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
+    const uintptr_t n = *reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
+    if (n <= ic) Kokkos::abort("Kokkos::DynamicView array bounds error");
 #endif
-    {
-      // Verify that allocation of the requested chunk in in progress.
-
-      // The allocated chunk counter is m_chunks[ m_chunk_max ]
-      const uintptr_t n =
-          *reinterpret_cast<uintptr_t volatile*>(m_chunks + m_chunk_max);
-
-      if (n <= ic) {
-        Kokkos::abort("Kokkos::DynamicView array bounds error");
-      }
-
-      // Allocation of this chunk is in progress
-      // so wait for allocation to complete.
-      while (nullptr == *ch)
-        ;
-    }
 
+    typename traits::value_type** const ch = m_chunks + ic;
     return (*ch)[i0 & m_chunk_mask];
   }
 
@@ -481,7 +465,10 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
     // *m_chunks_host[m_chunk_max+1] stores the 'extent' requested by resize
     *(pc + 1) = n;
 
-    m_chunks_host.deep_copy_to(m_chunks);
+    typename device_space::execution_space exec{};
+    m_chunks_host.deep_copy_to(exec, m_chunks);
+    exec.fence(
+        "DynamicView::resize_serial: Fence after copying chunks to the device");
   }
 
   KOKKOS_INLINE_FUNCTION bool is_allocated() const {
@@ -496,6 +483,12 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
     }
   }
 
+  KOKKOS_FUNCTION const device_accessor& impl_get_chunks() const {
+    return m_chunks;
+  }
+
+  KOKKOS_FUNCTION device_accessor& impl_get_chunks() { return m_chunks; }
+
   //----------------------------------------------------------------------
 
   ~DynamicView()                  = default;
@@ -525,9 +518,10 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    *  A maximum size is required in order to allocate a
    *  chunk-pointer array.
    */
-  explicit inline DynamicView(const std::string& arg_label,
-                              const unsigned min_chunk_size,
-                              const unsigned max_extent)
+  template <class... Prop>
+  DynamicView(const Kokkos::Impl::ViewCtorProp<Prop...>& arg_prop,
+              const unsigned min_chunk_size,
+              const unsigned max_extent)
       :  // The chunk size is guaranteed to be a power of two
         m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains(
             min_chunk_size))  // div ceil(log2(min_chunk_size))
@@ -540,33 +534,336 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
         m_chunk_size(2 << (m_chunk_shift - 1)) {
     m_chunks = device_accessor(m_chunk_max, m_chunk_size);
 
+    const std::string& label =
+        static_cast<Kokkos::Impl::ViewCtorProp<void, std::string> const&>(
+            arg_prop)
+            .value;
+
     if (device_accessor::template IsAccessibleFrom<host_space>::value) {
-      m_chunks.template allocate_with_destroy<device_space>(arg_label);
+      m_chunks.template allocate_with_destroy<device_space>(label);
       m_chunks.initialize();
       m_chunks_host =
           device_accessor::template create_mirror<host_space>(m_chunks);
     } else {
-      m_chunks.allocate_device(arg_label);
+      m_chunks.allocate_device(label);
       m_chunks_host =
           device_accessor::template create_mirror<host_space>(m_chunks);
       m_chunks_host.template allocate_with_destroy<device_space>(
-          arg_label, m_chunks.get_ptr());
+          label, m_chunks.get_ptr());
       m_chunks_host.initialize();
-      m_chunks_host.deep_copy_to(m_chunks);
+
+      // Add some properties if not provided to avoid need for if constexpr
+      using alloc_prop_input = Kokkos::Impl::ViewCtorProp<Prop...>;
+      using alloc_prop       = Kokkos::Impl::ViewCtorProp<
+          Prop..., std::conditional_t<alloc_prop_input::has_execution_space,
+                                      std::integral_constant<unsigned int, 15>,
+                                      typename device_space::execution_space>>;
+      alloc_prop arg_prop_copy(arg_prop);
+
+      const auto& exec = static_cast<const Kokkos::Impl::ViewCtorProp<
+          void, typename alloc_prop::execution_space>&>(arg_prop_copy)
+                             .value;
+      m_chunks_host.deep_copy_to(exec, m_chunks);
+      if (!alloc_prop_input::has_execution_space)
+        exec.fence(
+            "DynamicView::DynamicView(): Fence after copying chunks to the "
+            "device");
     }
   }
+
+  DynamicView(const std::string& arg_label, const unsigned min_chunk_size,
+              const unsigned max_extent)
+      : DynamicView(Kokkos::view_alloc(arg_label), min_chunk_size, max_extent) {
+  }
 };
 
 }  // namespace Experimental
+
+template <class>
+struct is_dynamic_view : public std::false_type {};
+
+template <class D, class... P>
+struct is_dynamic_view<Kokkos::Experimental::DynamicView<D, P...>>
+    : public std::true_type {};
+
 }  // namespace Kokkos
 
 namespace Kokkos {
 
+namespace Impl {
+
+// Deduce Mirror Types
+template <class Space, class T, class... P>
+struct MirrorDynamicViewType {
+  // The incoming view_type
+  using src_view_type = typename Kokkos::Experimental::DynamicView<T, P...>;
+  // The memory space for the mirror view
+  using memory_space = typename Space::memory_space;
+  // Check whether it is the same memory space
+  enum {
+    is_same_memspace =
+        std::is_same<memory_space, typename src_view_type::memory_space>::value
+  };
+  // The array_layout
+  using array_layout = typename src_view_type::array_layout;
+  // The data type (we probably want it non-const since otherwise we can't even
+  // deep_copy to it.)
+  using data_type = typename src_view_type::non_const_data_type;
+  // The destination view type if it is not the same memory space
+  using dest_view_type =
+      Kokkos::Experimental::DynamicView<data_type, array_layout, Space>;
+  // If it is the same memory_space return the existing view_type
+  // This will also keep the unmanaged trait if necessary
+  using view_type =
+      std::conditional_t<is_same_memspace, src_view_type, dest_view_type>;
+};
+}  // namespace Impl
+
+namespace Impl {
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror(
+    const Kokkos::Experimental::DynamicView<T, P...>& src,
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    std::enable_if_t<!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* =
+        nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
+
+  auto ret = typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror(
+      prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size());
+
+  ret.resize_serial(src.extent(0));
+
+  return ret;
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror(
+    const Kokkos::Experimental::DynamicView<T, P...>& src,
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    std::enable_if_t<Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* =
+        nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
+
+  using MemorySpace = typename alloc_prop_input::memory_space;
+  using alloc_prop  = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
+
+  auto ret = typename Kokkos::Impl::MirrorDynamicViewType<
+      MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(),
+                                       src.chunk_max() * src.chunk_size());
+
+  ret.resize_serial(src.extent(0));
+
+  return ret;
+}
+}  // namespace Impl
+
+// Create a mirror in host space
+template <class T, class... P>
+inline auto create_mirror(
+    const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror(src, Impl::ViewCtorProp<>{});
+}
+
 template <class T, class... P>
-inline typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror
-create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src) {
+inline auto create_mirror(
+    Kokkos::Impl::WithoutInitializing_t wi,
+    const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror(src, Kokkos::view_alloc(wi));
+}
+
+// Create a mirror in a new space
+template <class Space, class T, class... P>
+inline auto create_mirror(
+    const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror(
+      src, Impl::ViewCtorProp<>{typename Space::memory_space{}});
+}
+
+template <class Space, class T, class... P>
+typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type
+create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
+              const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror(
+      src, Kokkos::view_alloc(wi, typename Space::memory_space{}));
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror(src, arg_prop);
+}
+
+namespace Impl {
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    (std::is_same<
+         typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
+         typename Kokkos::Experimental::DynamicView<
+             T, P...>::HostMirror::memory_space>::value &&
+     std::is_same<
+         typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
+         typename Kokkos::Experimental::DynamicView<
+             T, P...>::HostMirror::data_type>::value),
+    typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
+create_mirror_view(
+    const typename Kokkos::Experimental::DynamicView<T, P...>& src,
+    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
+  return src;
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    !(std::is_same<
+          typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
+          typename Kokkos::Experimental::DynamicView<
+              T, P...>::HostMirror::memory_space>::value &&
+      std::is_same<
+          typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
+          typename Kokkos::Experimental::DynamicView<
+              T, P...>::HostMirror::data_type>::value),
+    typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return Kokkos::create_mirror(arg_prop, src);
+}
+
+template <class Space, class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace,
+    typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type>
+create_mirror_view(const Space&,
+                   const Kokkos::Experimental::DynamicView<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
+}  // namespace Impl
+
+// Create a mirror view in host space
+template <class T, class... P>
+inline auto create_mirror_view(
+    const typename Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror_view(src, Impl::ViewCtorProp<>{});
+}
+
+template <class T, class... P>
+inline auto create_mirror_view(
+    Kokkos::Impl::WithoutInitializing_t wi,
+    const typename Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror_view(src, Kokkos::view_alloc(wi));
+}
+
+// Create a mirror in a new space
+template <class Space, class T, class... P>
+inline auto create_mirror_view(
+    const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
+}
+
+template <class Space, class T, class... P>
+inline auto create_mirror_view(
+    Kokkos::Impl::WithoutInitializing_t wi, const Space&,
+    const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror_view(
+      src, Kokkos::view_alloc(wi, typename Space::memory_space{}));
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror_view(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror_view(src, arg_prop);
+}
+
+template <class T, class... DP, class... SP>
+inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
+                      const Kokkos::Experimental::DynamicView<T, SP...>& src) {
+  using dst_type = Kokkos::Experimental::DynamicView<T, DP...>;
+  using src_type = Kokkos::Experimental::DynamicView<T, SP...>;
+
+  using dst_execution_space = typename ViewTraits<T, DP...>::execution_space;
+  using src_execution_space = typename ViewTraits<T, SP...>::execution_space;
+  using dst_memory_space    = typename ViewTraits<T, DP...>::memory_space;
+  using src_memory_space    = typename ViewTraits<T, SP...>::memory_space;
+
+  constexpr bool DstExecCanAccessSrc =
+      Kokkos::SpaceAccessibility<dst_execution_space,
+                                 src_memory_space>::accessible;
+  constexpr bool SrcExecCanAccessDst =
+      Kokkos::SpaceAccessibility<src_execution_space,
+                                 dst_memory_space>::accessible;
+
+  if (DstExecCanAccessSrc)
+    Kokkos::Impl::ViewRemap<dst_type, src_type, dst_execution_space>(dst, src);
+  else if (SrcExecCanAccessDst)
+    Kokkos::Impl::ViewRemap<dst_type, src_type, src_execution_space>(dst, src);
+  else
+    src.impl_get_chunks().deep_copy_to(dst_execution_space{},
+                                       dst.impl_get_chunks());
+  Kokkos::fence("Kokkos::deep_copy(DynamicView)");
+}
+
+template <class ExecutionSpace, class T, class... DP, class... SP>
+inline void deep_copy(const ExecutionSpace& exec,
+                      const Kokkos::Experimental::DynamicView<T, DP...>& dst,
+                      const Kokkos::Experimental::DynamicView<T, SP...>& src) {
+  using dst_type = Kokkos::Experimental::DynamicView<T, DP...>;
+  using src_type = Kokkos::Experimental::DynamicView<T, SP...>;
+
+  using dst_execution_space = typename ViewTraits<T, DP...>::execution_space;
+  using src_execution_space = typename ViewTraits<T, SP...>::execution_space;
+  using dst_memory_space    = typename ViewTraits<T, DP...>::memory_space;
+  using src_memory_space    = typename ViewTraits<T, SP...>::memory_space;
+
+  constexpr bool DstExecCanAccessSrc =
+      Kokkos::SpaceAccessibility<dst_execution_space,
+                                 src_memory_space>::accessible;
+  constexpr bool SrcExecCanAccessDst =
+      Kokkos::SpaceAccessibility<src_execution_space,
+                                 dst_memory_space>::accessible;
+
+  // FIXME use execution space
+  if (DstExecCanAccessSrc)
+    Kokkos::Impl::ViewRemap<dst_type, src_type, dst_execution_space>(dst, src);
+  else if (SrcExecCanAccessDst)
+    Kokkos::Impl::ViewRemap<dst_type, src_type, src_execution_space>(dst, src);
+  else
+    src.impl_get_chunks().deep_copy_to(exec, dst.impl_get_chunks());
+}
 
 template <class T, class... DP, class... SP>
 inline void deep_copy(const View<T, DP...>& dst,
@@ -587,6 +884,7 @@ inline void deep_copy(const View<T, DP...>& dst,
     // Copying data between views in accessible memory spaces and either
     // non-contiguous or incompatible shape.
     Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src);
+    Kokkos::fence("Kokkos::deep_copy(DynamicView)");
   } else {
     Kokkos::Impl::throw_runtime_exception(
         "deep_copy given views that would require a temporary allocation");
@@ -612,6 +910,7 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
     // Copying data between views in accessible memory spaces and either
     // non-contiguous or incompatible shape.
     Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src);
+    Kokkos::fence("Kokkos::deep_copy(DynamicView)");
   } else {
     Kokkos::Impl::throw_runtime_exception(
         "deep_copy given views that would require a temporary allocation");
@@ -698,6 +997,105 @@ struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>,
 };
 
 }  // namespace Impl
+
+template <class... ViewCtorArgs, class T, class... P>
+auto create_mirror_view_and_copy(
+    const Impl::ViewCtorProp<ViewCtorArgs...>&,
+    const Kokkos::Experimental::DynamicView<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        Impl::MirrorDynamicViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace>* = nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  static_assert(
+      alloc_prop_input::has_memory_space,
+      "The view constructor arguments passed to "
+      "Kokkos::create_mirror_view_and_copy must include a memory space!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::allow_padding,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not explicitly allow padding!");
+
+  // same behavior as deep_copy(src, src)
+  if (!alloc_prop_input::has_execution_space)
+    fence(
+        "Kokkos::create_mirror_view_and_copy: fence before returning src view");
+  return src;
+}
+
+template <class... ViewCtorArgs, class T, class... P>
+auto create_mirror_view_and_copy(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::Experimental::DynamicView<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        !Impl::MirrorDynamicViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace>* = nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  static_assert(
+      alloc_prop_input::has_memory_space,
+      "The view constructor arguments passed to "
+      "Kokkos::create_mirror_view_and_copy must include a memory space!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::allow_padding,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not explicitly allow padding!");
+  using Space = typename alloc_prop_input::memory_space;
+  using Mirror =
+      typename Impl::MirrorDynamicViewType<Space, T, P...>::view_type;
+
+  // Add some properties if not provided to avoid need for if constexpr
+  using alloc_prop = Impl::ViewCtorProp<
+      ViewCtorArgs...,
+      std::conditional_t<alloc_prop_input::has_label,
+                         std::integral_constant<unsigned int, 12>, std::string>,
+      std::conditional_t<!alloc_prop_input::initialize,
+                         std::integral_constant<unsigned int, 13>,
+                         Impl::WithoutInitializing_t>,
+      std::conditional_t<alloc_prop_input::has_execution_space,
+                         std::integral_constant<unsigned int, 14>,
+                         typename Space::execution_space>>;
+  alloc_prop arg_prop_copy(arg_prop);
+
+  std::string& label =
+      static_cast<Impl::ViewCtorProp<void, std::string>&>(arg_prop_copy).value;
+  if (label.empty()) label = src.label();
+  auto mirror = typename Mirror::non_const_type(
+      arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size());
+  mirror.resize_serial(src.extent(0));
+  if (alloc_prop_input::has_execution_space) {
+    using ExecutionSpace = typename alloc_prop::execution_space;
+    deep_copy(
+        static_cast<Impl::ViewCtorProp<void, ExecutionSpace>&>(arg_prop_copy)
+            .value,
+        mirror, src);
+  } else
+    deep_copy(mirror, src);
+  return mirror;
+}
+
+template <class Space, class T, class... P>
+auto create_mirror_view_and_copy(
+    const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src,
+    std::string const& name = "") {
+  return create_mirror_view_and_copy(
+      Kokkos::view_alloc(typename Space::memory_space{}, name), src);
+}
+
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DYNAMICVIEW
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DYNAMICVIEW
+#endif
 #endif /* #ifndef KOKKOS_DYNAMIC_VIEW_HPP */
diff --git a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
index 629b437c2..8affa0bac 100644
--- a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
 #define KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ERRORREPORTER
+#endif
 
 #include <vector>
 #include <Kokkos_Core.hpp>
@@ -192,4 +196,8 @@ void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size) {
 }  // namespace Experimental
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ERRORREPORTER
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ERRORREPORTER
+#endif
 #endif
diff --git a/packages/kokkos/containers/src/Kokkos_Functional.hpp b/packages/kokkos/containers/src/Kokkos_Functional.hpp
index 2e1fa336f..478a087d0 100644
--- a/packages/kokkos/containers/src/Kokkos_Functional.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Functional.hpp
@@ -42,6 +42,10 @@
 
 #ifndef KOKKOS_FUNCTIONAL_HPP
 #define KOKKOS_FUNCTIONAL_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_FUNCTIONAL
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <impl/Kokkos_Functional_impl.hpp>
@@ -52,10 +56,12 @@ namespace Kokkos {
 
 template <typename T>
 struct pod_hash {
-  using argument_type        = T;
-  using first_argument_type  = T;
-  using second_argument_type = uint32_t;
-  using result_type          = uint32_t;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using argument_type KOKKOS_DEPRECATED        = T;
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = uint32_t;
+  using result_type KOKKOS_DEPRECATED          = uint32_t;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   uint32_t operator()(T const& t) const {
@@ -70,9 +76,11 @@ struct pod_hash {
 
 template <typename T>
 struct pod_equal_to {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const {
@@ -82,9 +90,11 @@ struct pod_equal_to {
 
 template <typename T>
 struct pod_not_equal_to {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const {
@@ -94,9 +104,11 @@ struct pod_not_equal_to {
 
 template <typename T>
 struct equal_to {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const { return a == b; }
@@ -104,9 +116,11 @@ struct equal_to {
 
 template <typename T>
 struct not_equal_to {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const { return a != b; }
@@ -114,9 +128,11 @@ struct not_equal_to {
 
 template <typename T>
 struct greater {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const { return a > b; }
@@ -124,9 +140,11 @@ struct greater {
 
 template <typename T>
 struct less {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const { return a < b; }
@@ -134,9 +152,11 @@ struct less {
 
 template <typename T>
 struct greater_equal {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const { return a >= b; }
@@ -144,9 +164,11 @@ struct greater_equal {
 
 template <typename T>
 struct less_equal {
-  using first_argument_type  = T;
-  using second_argument_type = T;
-  using result_type          = bool;
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using first_argument_type KOKKOS_DEPRECATED  = T;
+  using second_argument_type KOKKOS_DEPRECATED = T;
+  using result_type KOKKOS_DEPRECATED          = bool;
+#endif
 
   KOKKOS_FORCEINLINE_FUNCTION
   bool operator()(T const& a, T const& b) const { return a <= b; }
@@ -154,4 +176,8 @@ struct less_equal {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_FUNCTIONAL
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_FUNCTIONAL
+#endif
 #endif  // KOKKOS_FUNCTIONAL_HPP
diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
index 9d97dc08f..0b54d1bdd 100644
--- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -7,6 +7,10 @@
 
 #ifndef KOKKOS_OFFSETVIEW_HPP_
 #define KOKKOS_OFFSETVIEW_HPP_
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_OFFSETVIEW
+#endif
 
 #include <Kokkos_Core.hpp>
 
@@ -25,26 +29,25 @@ template <class>
 struct is_offset_view : public std::false_type {};
 
 template <class D, class... P>
-struct is_offset_view<OffsetView<D, P...> > : public std::true_type {};
+struct is_offset_view<OffsetView<D, P...>> : public std::true_type {};
 
 template <class D, class... P>
-struct is_offset_view<const OffsetView<D, P...> > : public std::true_type {};
+struct is_offset_view<const OffsetView<D, P...>> : public std::true_type {};
 
 #define KOKKOS_INVALID_OFFSET int64_t(0x7FFFFFFFFFFFFFFFLL)
 #define KOKKOS_INVALID_INDEX_RANGE \
   { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET }
 
-template <typename iType,
-          typename std::enable_if<std::is_integral<iType>::value &&
-                                      std::is_signed<iType>::value,
-                                  iType>::type = 0>
+template <typename iType, std::enable_if_t<std::is_integral<iType>::value &&
+                                               std::is_signed<iType>::value,
+                                           iType> = 0>
 using IndexRange = Kokkos::Array<iType, 2>;
 
 using index_list_type = std::initializer_list<int64_t>;
 
 //  template <typename iType,
-//    typename std::enable_if< std::is_integral<iType>::value &&
-//      std::is_signed<iType>::value, iType >::type = 0> using min_index_type =
+//    std::enable_if_t< std::is_integral<iType>::value &&
+//      std::is_signed<iType>::value, iType > = 0> using min_index_type =
 //      std::initializer_list<iType>;
 
 namespace Impl {
@@ -191,9 +194,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   enum { Rank = map_type::Rank };
   using begins_type = Kokkos::Array<int64_t, Rank>;
 
-  template <
-      typename iType,
-      typename std::enable_if<std::is_integral<iType>::value, iType>::type = 0>
+  template <typename iType,
+            std::enable_if_t<std::is_integral<iType>::value, iType> = 0>
   KOKKOS_INLINE_FUNCTION int64_t begin(const iType local_dimension) const {
     return local_dimension < Rank ? m_begins[local_dimension]
                                   : KOKKOS_INVALID_OFFSET;
@@ -202,9 +204,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   KOKKOS_INLINE_FUNCTION
   begins_type begins() const { return m_begins; }
 
-  template <
-      typename iType,
-      typename std::enable_if<std::is_integral<iType>::value, iType>::type = 0>
+  template <typename iType,
+            std::enable_if_t<std::is_integral<iType>::value, iType> = 0>
   KOKKOS_INLINE_FUNCTION int64_t end(const iType local_dimension) const {
     return begin(local_dimension) + m_map.extent(local_dimension);
   }
@@ -249,16 +250,16 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   // constexpr unsigned rank() { return map_type::Rank; }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
-      extent(const iType& r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, size_t>
+  extent(const iType& r) const {
     return m_map.extent(r);
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, int>::type
-      extent_int(const iType& r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, int>
+  extent_int(const iType& r) const {
     return static_cast<int>(m_map.extent(r));
   }
 
@@ -299,9 +300,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
-      stride(iType r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, size_t>
+  stride(iType r) const {
     return (
         r == 0
             ? m_map.stride_0()
@@ -368,7 +369,7 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
       std::is_same<typename traits::array_layout, Kokkos::LayoutStride>::value;
 
   static constexpr bool is_default_map =
-      std::is_same<typename traits::specialize, void>::value &&
+      std::is_void<typename traits::specialize>::value &&
       (is_layout_left || is_layout_right || is_layout_stride);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
@@ -401,11 +402,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   // Rank 1 operator()
 
   template <typename I0>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && !is_default_map),
-                              reference_type>::type
-      operator()(const I0& i0) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && !is_default_map),
+      reference_type>
+  operator()(const I0& i0) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0))
     const size_t j0 = i0 - m_begins[0];
     return m_map.reference(j0);
@@ -413,10 +413,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               !is_layout_stride),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) &&
+                        is_default_map && !is_layout_stride),
+                       reference_type>
       operator()(const I0& i0) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0))
     const size_t j0 = i0 - m_begins[0];
@@ -425,10 +424,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               is_layout_stride),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) &&
+                        is_default_map && is_layout_stride),
+                       reference_type>
       operator()(const I0& i0) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0))
     const size_t j0 = i0 - m_begins[0];
@@ -438,11 +436,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   // Rank 1 operator[]
 
   template <typename I0>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && !is_default_map),
-                              reference_type>::type
-      operator[](const I0& i0) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && !is_default_map),
+      reference_type>
+  operator[](const I0& i0) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0))
     const size_t j0 = i0 - m_begins[0];
     return m_map.reference(j0);
@@ -450,10 +447,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               !is_layout_stride),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) &&
+                        is_default_map && !is_layout_stride),
+                       reference_type>
       operator[](const I0& i0) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0))
     const size_t j0 = i0 - m_begins[0];
@@ -462,10 +458,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               is_layout_stride),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) &&
+                        is_default_map && is_layout_stride),
+                       reference_type>
       operator[](const I0& i0) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0))
     const size_t j0 = i0 - m_begins[0];
@@ -477,9 +472,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && !is_default_map),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1>::value &&
+                        (2 == Rank) && !is_default_map),
+                       reference_type>
       operator()(const I0& i0, const I1& i1) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1))
     const size_t j0 = i0 - m_begins[0];
@@ -488,12 +483,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_left && (traits::rank_dynamic == 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) &&
+       is_default_map && is_layout_left && (traits::rank_dynamic == 0)),
+      reference_type>
+  operator()(const I0& i0, const I1& i1) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1))
     const size_t j0 = i0 - m_begins[0];
     const size_t j1 = i1 - m_begins[1];
@@ -501,12 +495,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_left && (traits::rank_dynamic != 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) &&
+       is_default_map && is_layout_left && (traits::rank_dynamic != 0)),
+      reference_type>
+  operator()(const I0& i0, const I1& i1) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1))
     const size_t j0 = i0 - m_begins[0];
     const size_t j1 = i1 - m_begins[1];
@@ -514,12 +507,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_right && (traits::rank_dynamic == 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) &&
+       is_default_map && is_layout_right && (traits::rank_dynamic == 0)),
+      reference_type>
+  operator()(const I0& i0, const I1& i1) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1))
     const size_t j0 = i0 - m_begins[0];
     const size_t j1 = i1 - m_begins[1];
@@ -527,12 +519,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_right && (traits::rank_dynamic != 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) &&
+       is_default_map && is_layout_right && (traits::rank_dynamic != 0)),
+      reference_type>
+  operator()(const I0& i0, const I1& i1) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1))
     const size_t j0 = i0 - m_begins[0];
     const size_t j1 = i1 - m_begins[1];
@@ -541,10 +532,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_stride),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1>::value &&
+                        (2 == Rank) && is_default_map && is_layout_stride),
+                       reference_type>
       operator()(const I0& i0, const I1& i1) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1))
     const size_t j0 = i0 - m_begins[0];
@@ -558,9 +548,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value &&
-                               (3 == Rank) && is_default_map),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2>::value &&
+                        (3 == Rank) && is_default_map),
+                       reference_type>
       operator()(const I0& i0, const I1& i1, const I2& i2) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
         (m_track, m_map, m_begins, i0, i1, i2))
@@ -572,9 +562,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value &&
-                               (3 == Rank) && !is_default_map),
-                              reference_type>::type
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2>::value &&
+                        (3 == Rank) && !is_default_map),
+                       reference_type>
       operator()(const I0& i0, const I1& i1, const I2& i2) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
         (m_track, m_map, m_begins, i0, i1, i2))
@@ -588,11 +578,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   // Rank 4
 
   template <typename I0, typename I1, typename I2, typename I3>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) &&
-       is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3>::value &&
+                        (4 == Rank) && is_default_map),
+                       reference_type>
+      operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
         (m_track, m_map, m_begins, i0, i1, i2, i3))
     const size_t j0 = i0 - m_begins[0];
@@ -603,11 +593,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename I0, typename I1, typename I2, typename I3>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) &&
-       !is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3>::value &&
+                        (4 == Rank) && !is_default_map),
+                       reference_type>
+      operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
         (m_track, m_map, m_begins, i0, i1, i2, i3))
     const size_t j0 = i0 - m_begins[0];
@@ -621,12 +611,12 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   // Rank 5
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) &&
-       is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4) const {
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value &&
+                        (5 == Rank) && is_default_map),
+                       reference_type>
+      operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
+                 const I4& i4) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
         (m_track, m_map, m_begins, i0, i1, i2, i3, i4))
     const size_t j0 = i0 - m_begins[0];
@@ -638,12 +628,12 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) &&
-       !is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4) const {
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value &&
+                        (5 == Rank) && !is_default_map),
+                       reference_type>
+      operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
+                 const I4& i4) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
         (m_track, m_map, m_begins, i0, i1, i2, i3, i4))
     const size_t j0 = i0 - m_begins[0];
@@ -659,10 +649,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
             typename I5>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value &&
        (6 == Rank) && is_default_map),
-      reference_type>::type
+      reference_type>
   operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
              const I4& i4, const I5& i5) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
@@ -678,10 +668,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
             typename I5>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value &&
        (6 == Rank) && !is_default_map),
-      reference_type>::type
+      reference_type>
   operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
              const I4& i4, const I5& i5) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
@@ -700,10 +690,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
             typename I5, typename I6>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value &&
        (7 == Rank) && is_default_map),
-      reference_type>::type
+      reference_type>
   operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
              const I4& i4, const I5& i5, const I6& i6) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
@@ -720,10 +710,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
             typename I5, typename I6>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value &&
        (7 == Rank) && !is_default_map),
-      reference_type>::type
+      reference_type>
   operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
              const I4& i4, const I5& i5, const I6& i6) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
@@ -743,10 +733,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
             typename I5, typename I6, typename I7>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value &&
        (8 == Rank) && is_default_map),
-      reference_type>::type
+      reference_type>
   operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
              const I4& i4, const I5& i5, const I6& i6, const I7& i7) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
@@ -765,10 +755,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
             typename I5, typename I6, typename I7>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
       (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value &&
        (8 == Rank) && !is_default_map),
-      reference_type>::type
+      reference_type>
   operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
              const I4& i4, const I5& i5, const I6& i6, const I7& i7) const {
     KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(
@@ -1104,40 +1094,75 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
     return m_track.template get_label<typename traits::memory_space>();
   }
 
+  // Choosing std::pair as type for the arguments allows constructing an
+  // OffsetView using list initialization syntax, e.g.,
+  //   OffsetView dummy("dummy", {-1, 3}, {-2,2});
+  // We could allow arbitrary types RangeType that support
+  // std::get<{0,1}>(RangeType const&) with std::tuple_size<RangeType>::value==2
+  // but this wouldn't allow using the syntax in the example above.
   template <typename Label>
   explicit inline OffsetView(
       const Label& arg_label,
-      typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value,
-                              const index_list_type>::type range0,
+      std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value,
+                       const std::pair<int64_t, int64_t>>
+          range0,
+      const std::pair<int64_t, int64_t> range1 = KOKKOS_INVALID_INDEX_RANGE,
+      const std::pair<int64_t, int64_t> range2 = KOKKOS_INVALID_INDEX_RANGE,
+      const std::pair<int64_t, int64_t> range3 = KOKKOS_INVALID_INDEX_RANGE,
+      const std::pair<int64_t, int64_t> range4 = KOKKOS_INVALID_INDEX_RANGE,
+      const std::pair<int64_t, int64_t> range5 = KOKKOS_INVALID_INDEX_RANGE,
+      const std::pair<int64_t, int64_t> range6 = KOKKOS_INVALID_INDEX_RANGE,
+      const std::pair<int64_t, int64_t> range7 = KOKKOS_INVALID_INDEX_RANGE
+
+      )
+      : OffsetView(
+            Kokkos::Impl::ViewCtorProp<std::string>(arg_label),
+            typename traits::array_layout(range0.second - range0.first + 1,
+                                          range1.second - range1.first + 1,
+                                          range2.second - range2.first + 1,
+                                          range3.second - range3.first + 1,
+                                          range4.second - range4.first + 1,
+                                          range5.second - range5.first + 1,
+                                          range6.second - range6.first + 1,
+                                          range7.second - range7.first + 1),
+            {range0.first, range1.first, range2.first, range3.first,
+             range4.first, range5.first, range6.first, range7.first}) {}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  template <typename Label>
+  KOKKOS_DEPRECATED_WITH_COMMENT(
+      "Use the constructor taking std::pair<int64_t, int64_t> arguments "
+      "instead!")
+  explicit inline OffsetView(
+      const Label& arg_label,
+      std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value,
+                       const index_list_type>
+          range0,
       const index_list_type range1 = KOKKOS_INVALID_INDEX_RANGE,
       const index_list_type range2 = KOKKOS_INVALID_INDEX_RANGE,
       const index_list_type range3 = KOKKOS_INVALID_INDEX_RANGE,
       const index_list_type range4 = KOKKOS_INVALID_INDEX_RANGE,
       const index_list_type range5 = KOKKOS_INVALID_INDEX_RANGE,
       const index_list_type range6 = KOKKOS_INVALID_INDEX_RANGE,
-      const index_list_type range7 = KOKKOS_INVALID_INDEX_RANGE
-
-      )
-      : OffsetView(Kokkos::Impl::ViewCtorProp<std::string>(arg_label),
-                   typename traits::array_layout(
-                       range0.begin()[1] - range0.begin()[0] + 1,
-                       range1.begin()[1] - range1.begin()[0] + 1,
-                       range2.begin()[1] - range2.begin()[0] + 1,
-                       range3.begin()[1] - range3.begin()[0] + 1,
-                       range4.begin()[1] - range4.begin()[0] + 1,
-                       range5.begin()[1] - range5.begin()[0] + 1,
-                       range6.begin()[1] - range6.begin()[0] + 1,
-                       range7.begin()[1] - range7.begin()[0] + 1),
-                   {range0.begin()[0], range1.begin()[0], range2.begin()[0],
-                    range3.begin()[0], range4.begin()[0], range5.begin()[0],
-                    range6.begin()[0], range7.begin()[0]}) {}
+      const index_list_type range7 = KOKKOS_INVALID_INDEX_RANGE)
+      : OffsetView(
+            arg_label,
+            std::pair<int64_t, int64_t>(range0.begin()[0], range0.begin()[1]),
+            std::pair<int64_t, int64_t>(range1.begin()[0], range1.begin()[1]),
+            std::pair<int64_t, int64_t>(range2.begin()[0], range2.begin()[1]),
+            std::pair<int64_t, int64_t>(range3.begin()[0], range3.begin()[1]),
+            std::pair<int64_t, int64_t>(range4.begin()[0], range4.begin()[1]),
+            std::pair<int64_t, int64_t>(range5.begin()[0], range5.begin()[1]),
+            std::pair<int64_t, int64_t>(range6.begin()[0], range6.begin()[1]),
+            std::pair<int64_t, int64_t>(range7.begin()[0], range7.begin()[1])) {
+  }
+#endif
 
   template <class... P>
   explicit KOKKOS_INLINE_FUNCTION OffsetView(
       const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
-                              typename traits::array_layout>::type const&
-          arg_layout,
+      std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
+                       typename traits::array_layout> const& arg_layout,
       const index_list_type minIndices)
       : m_track()  // No memory tracking
         ,
@@ -1155,9 +1180,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
   template <class... P>
   explicit inline OffsetView(
       const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
-                              typename traits::array_layout>::type const&
-          arg_layout,
+      std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
+                       typename traits::array_layout> const& arg_layout,
       const index_list_type minIndices)
       : m_track(),
         m_map()
@@ -1172,17 +1196,14 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
     // to avoid duplicate class error.
     using alloc_prop = Kokkos::Impl::ViewCtorProp<
         P...,
-        typename std::conditional<alloc_prop_input::has_label,
-                                  std::integral_constant<unsigned, 0>,
-                                  typename std::string>::type,
-        typename std::conditional<
-            alloc_prop_input::has_memory_space,
-            std::integral_constant<unsigned, 1>,
-            typename traits::device_type::memory_space>::type,
-        typename std::conditional<
-            alloc_prop_input::has_execution_space,
-            std::integral_constant<unsigned, 2>,
-            typename traits::device_type::execution_space>::type>;
+        std::conditional_t<alloc_prop_input::has_label,
+                           std::integral_constant<unsigned, 0>, std::string>,
+        std::conditional_t<alloc_prop_input::has_memory_space,
+                           std::integral_constant<unsigned, 1>,
+                           typename traits::device_type::memory_space>,
+        std::conditional_t<alloc_prop_input::has_execution_space,
+                           std::integral_constant<unsigned, 2>,
+                           typename traits::device_type::execution_space>>;
 
     static_assert(traits::is_managed,
                   "OffsetView allocation constructor requires managed memory");
@@ -1214,8 +1235,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 #endif
     //------------------------------------------------------------
 
-    Kokkos::Impl::SharedAllocationRecord<>* record =
-        m_map.allocate_shared(prop_copy, arg_layout);
+    Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
+        prop_copy, arg_layout,
+        Kokkos::Impl::ViewCtorProp<P...>::has_execution_space);
 
     //------------------------------------------------------------
 #if defined(KOKKOS_ENABLE_CUDA)
@@ -1252,9 +1274,8 @@ KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView<D, P...>& V) {
 namespace Impl {
 
 template <class T>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<std::is_integral<T>::value, T>::type
-    shift_input(const T arg, const int64_t offset) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, T>
+shift_input(const T arg, const int64_t offset) {
   return arg - offset;
 }
 
@@ -1265,22 +1286,21 @@ Kokkos::Impl::ALL_t shift_input(const Kokkos::Impl::ALL_t arg,
 }
 
 template <class T>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<std::is_integral<T>::value,
-                                               Kokkos::pair<T, T> >::type
-shift_input(const Kokkos::pair<T, T> arg, const int64_t offset) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<std::is_integral<T>::value, Kokkos::pair<T, T>>
+    shift_input(const Kokkos::pair<T, T> arg, const int64_t offset) {
   return Kokkos::make_pair<T, T>(arg.first - offset, arg.second - offset);
 }
 template <class T>
-inline
-    typename std::enable_if<std::is_integral<T>::value, std::pair<T, T> >::type
-    shift_input(const std::pair<T, T> arg, const int64_t offset) {
+inline std::enable_if_t<std::is_integral<T>::value, std::pair<T, T>>
+shift_input(const std::pair<T, T> arg, const int64_t offset) {
   return std::make_pair<T, T>(arg.first - offset, arg.second - offset);
 }
 
 template <size_t N, class Arg, class A>
 KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin(
     const size_t i, Kokkos::Array<int64_t, N>& subviewBegins,
-    typename std::enable_if<N != 0, const Arg>::type shiftedArg, const Arg arg,
+    std::enable_if_t<N != 0, const Arg> shiftedArg, const Arg arg,
     const A viewBegins, size_t& counter) {
   if (!std::is_integral<Arg>::value) {
     subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0;
@@ -1291,8 +1311,8 @@ KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin(
 template <size_t N, class Arg, class A>
 KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin(
     const size_t /*i*/, Kokkos::Array<int64_t, N>& /*subviewBegins*/,
-    typename std::enable_if<N == 0, const Arg>::type /*shiftedArg*/,
-    const Arg /*arg*/, const A /*viewBegins*/, size_t& /*counter*/) {}
+    std::enable_if_t<N == 0, const Arg> /*shiftedArg*/, const Arg /*arg*/,
+    const A /*viewBegins*/, size_t& /*counter*/) {}
 
 template <class D, class... P, class T>
 KOKKOS_INLINE_FUNCTION
@@ -1774,9 +1794,8 @@ template <class DT, class... DP>
 inline void deep_copy(
     const Experimental::OffsetView<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   static_assert(
       std::is_same<typename ViewTraits<DT, DP...>::non_const_value_type,
                    typename ViewTraits<DT, DP...>::value_type>::value,
@@ -1790,9 +1809,8 @@ template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
     const Experimental::OffsetView<DT, DP...>& dst,
     const Experimental::OffsetView<ST, SP...>& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   static_assert(
       std::is_same<typename ViewTraits<DT, DP...>::value_type,
                    typename ViewTraits<ST, SP...>::non_const_value_type>::value,
@@ -1805,9 +1823,8 @@ template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
     const Experimental::OffsetView<DT, DP...>& dst,
     const View<ST, SP...>& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   static_assert(
       std::is_same<typename ViewTraits<DT, DP...>::value_type,
                    typename ViewTraits<ST, SP...>::non_const_value_type>::value,
@@ -1821,9 +1838,8 @@ template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
     const View<DT, DP...>& dst,
     const Experimental::OffsetView<ST, SP...>& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   static_assert(
       std::is_same<typename ViewTraits<DT, DP...>::value_type,
                    typename ViewTraits<ST, SP...>::non_const_value_type>::value,
@@ -1856,8 +1872,8 @@ struct MirrorOffsetViewType {
       Kokkos::Experimental::OffsetView<data_type, array_layout, Space>;
   // If it is the same memory_space return the existing view_type
   // This will also keep the unmanaged trait if necessary
-  using view_type = typename std::conditional<is_same_memspace, src_view_type,
-                                              dest_view_type>::type;
+  using view_type =
+      std::conditional_t<is_same_memspace, src_view_type, dest_view_type>;
 };
 
 template <class Space, class T, class... P>
@@ -1883,163 +1899,202 @@ struct MirrorOffsetType {
 
 }  // namespace Impl
 
-template <class T, class... P>
+namespace Impl {
+template <class T, class... P, class... ViewCtorArgs>
 inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
-create_mirror(
-    const Kokkos::Experimental::OffsetView<T, P...>& src,
-    typename std::enable_if<
-        !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
-                      Kokkos::LayoutStride>::value>::type* = nullptr) {
-  using src_type = Experimental::OffsetView<T, P...>;
-  using dst_type = typename src_type::HostMirror;
-
-  return dst_type(
-      Kokkos::Impl::ViewCtorProp<std::string>(
-          std::string(src.label()).append("_mirror")),
-      typename Kokkos::ViewTraits<T, P...>::array_layout(
-          src.extent(0), src.extent(1), src.extent(2), src.extent(3),
-          src.extent(4), src.extent(5), src.extent(6), src.extent(7)),
+create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
+              const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
+      Kokkos::create_mirror(arg_prop, src.view()), src.begins());
+}
+
+template <class Space, class T, class... P, class... ViewCtorArgs>
+inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
+create_mirror(const Space&,
+              const Kokkos::Experimental::OffsetView<T, P...>& src,
+              const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::has_memory_space,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a memory space instance!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
+
+  return typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type(
+      prop_copy, src.layout(),
       {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4),
        src.begin(5), src.begin(6), src.begin(7)});
 }
+}  // namespace Impl
+
+// Create a mirror in host space
+template <class T, class... P>
+inline auto create_mirror(
+    const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror(src, Impl::ViewCtorProp<>{});
+}
 
 template <class T, class... P>
-inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
-create_mirror(
-    const Kokkos::Experimental::OffsetView<T, P...>& src,
-    typename std::enable_if<
-        std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
-                     Kokkos::LayoutStride>::value>::type* = nullptr) {
-  using src_type = Experimental::OffsetView<T, P...>;
-  using dst_type = typename src_type::HostMirror;
-
-  Kokkos::LayoutStride layout;
-
-  layout.dimension[0] = src.extent(0);
-  layout.dimension[1] = src.extent(1);
-  layout.dimension[2] = src.extent(2);
-  layout.dimension[3] = src.extent(3);
-  layout.dimension[4] = src.extent(4);
-  layout.dimension[5] = src.extent(5);
-  layout.dimension[6] = src.extent(6);
-  layout.dimension[7] = src.extent(7);
-
-  layout.stride[0] = src.stride_0();
-  layout.stride[1] = src.stride_1();
-  layout.stride[2] = src.stride_2();
-  layout.stride[3] = src.stride_3();
-  layout.stride[4] = src.stride_4();
-  layout.stride[5] = src.stride_5();
-  layout.stride[6] = src.stride_6();
-  layout.stride[7] = src.stride_7();
-
-  return dst_type(std::string(src.label()).append("_mirror"), layout,
-                  {src.begin(0), src.begin(1), src.begin(2), src.begin(3),
-                   src.begin(4), src.begin(5), src.begin(6), src.begin(7)});
+inline auto create_mirror(
+    Kokkos::Impl::WithoutInitializing_t wi,
+    const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror(src, Kokkos::view_alloc(wi));
+}
+
+// Create a mirror in a new space
+template <class Space, class T, class... P,
+          typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
+inline auto create_mirror(
+    const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{});
 }
 
-// Create a mirror in a new space (specialization for different space)
 template <class Space, class T, class... P>
 typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
-create_mirror(const Space&,
+create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
               const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type(
-      src.label(), src.layout(),
-      {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4),
-       src.begin(5), src.begin(6), src.begin(7)});
+  return Impl::create_mirror(space, src, Kokkos::view_alloc(wi));
 }
 
-template <class T, class... P>
-inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror(src, arg_prop);
+}
+
+namespace Impl {
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    (std::is_same<
+         typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
+         typename Kokkos::Experimental::OffsetView<
+             T, P...>::HostMirror::memory_space>::value &&
+     std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
+                  typename Kokkos::Experimental::OffsetView<
+                      T, P...>::HostMirror::data_type>::value),
+    typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
 create_mirror_view(
     const typename Kokkos::Experimental::OffsetView<T, P...>& src,
-    typename std::enable_if<
-        (std::is_same<
-             typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
-             typename Kokkos::Experimental::OffsetView<
-                 T, P...>::HostMirror::memory_space>::value &&
-         std::is_same<
-             typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
-             typename Kokkos::Experimental::OffsetView<
-                 T, P...>::HostMirror::data_type>::value)>::type* = nullptr) {
+    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
-template <class T, class... P>
-inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
-create_mirror_view(
-    const Kokkos::Experimental::OffsetView<T, P...>& src,
-    typename std::enable_if<
-        !(std::is_same<
-              typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
-              typename Kokkos::Experimental::OffsetView<
-                  T, P...>::HostMirror::memory_space>::value &&
-          std::is_same<
-              typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
-              typename Kokkos::Experimental::OffsetView<
-                  T, P...>::HostMirror::data_type>::value)>::type* = nullptr) {
-  return Kokkos::create_mirror(src);
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    !(std::is_same<
+          typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
+          typename Kokkos::Experimental::OffsetView<
+              T, P...>::HostMirror::memory_space>::value &&
+      std::is_same<
+          typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
+          typename Kokkos::Experimental::OffsetView<
+              T, P...>::HostMirror::data_type>::value),
+    typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return Kokkos::create_mirror(arg_prop, src);
 }
 
-// Create a mirror view in a new space (specialization for same space)
-template <class Space, class T, class... P>
-typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type
+template <class Space, class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
+    Kokkos::Experimental::OffsetView<T, P...>>
 create_mirror_view(const Space&,
                    const Kokkos::Experimental::OffsetView<T, P...>& src,
-                   typename std::enable_if<Impl::MirrorOffsetViewType<
-                       Space, T, P...>::is_same_memspace>::type* = nullptr) {
+                   const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
-// Create a mirror view in a new space (specialization for different space)
-template <class Space, class T, class... P>
-typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type
-create_mirror_view(const Space&,
+template <class Space, class T, class... P, class... ViewCtorArgs>
+std::enable_if_t<
+    !Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
+    typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type>
+create_mirror_view(const Space& space,
                    const Kokkos::Experimental::OffsetView<T, P...>& src,
-                   typename std::enable_if<!Impl::MirrorOffsetViewType<
-                       Space, T, P...>::is_same_memspace>::type* = nullptr) {
-  return typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type(
-      src.label(), src.layout(),
-      {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4),
-       src.begin(5), src.begin(6), src.begin(7)});
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return create_mirror(space, src, arg_prop);
+}
+}  // namespace Impl
+
+// Create a mirror view in host space
+template <class T, class... P>
+inline auto create_mirror_view(
+    const typename Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror_view(src, Impl::ViewCtorProp<>{});
+}
+
+template <class T, class... P>
+inline auto create_mirror_view(
+    Kokkos::Impl::WithoutInitializing_t wi,
+    const typename Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror_view(src, Kokkos::view_alloc(wi));
+}
+
+// Create a mirror view in a new space
+template <class Space, class T, class... P,
+          typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
+inline auto create_mirror_view(
+    const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
 }
-//
-//  // Create a mirror view and deep_copy in a new space (specialization for
-//  same space) template<class Space, class T, class ... P> typename
-//  Kokkos::Experimental::Impl::MirrorViewType<Space,T,P ...>::view_type
-//  create_mirror_view_and_copy(const Space& , const
-//  Kokkos::Experimental::OffsetView<T,P...> & src
-//                              , std::string const& name = ""
-//                                  , typename
-//                                  std::enable_if<Impl::MirrorViewType<Space,T,P
-//                                  ...>::is_same_memspace>::type* = nullptr) {
-//    (void)name;
-//    return src;
-//  }
-//
-//  // Create a mirror view and deep_copy in a new space (specialization for
-//  different space) template<class Space, class T, class ... P> typename
-//  Kokkos::Experimental::Impl::MirrorViewType<Space,T,P ...>::view_type
-//  create_mirror_view_and_copy(const Space& , const
-//  Kokkos::Experimental::OffsetView<T,P...> & src
-//                              , std::string const& name = ""
-//                                  , typename
-//                                  std::enable_if<!Impl::MirrorViewType<Space,T,P
-//                                  ...>::is_same_memspace>::type* = nullptr) {
-//    using Mirror = typename
-//    Kokkos::Experimental::Impl::MirrorViewType<Space,T,P ...>::view_type;
-//    std::string label = name.empty() ? src.label() : name;
-//    auto mirror = Mirror(view_alloc(WithoutInitializing, label), src.layout(),
-//                         { src.begin(0), src.begin(1), src.begin(2),
-//                         src.begin(3), src.begin(4),
-//                             src.begin(5), src.begin(6), src.begin(7) });
-//    deep_copy(mirror, src);
-//    return mirror;
-//  }
 
+template <class Space, class T, class... P>
+inline auto create_mirror_view(
+    Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
+    const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline auto create_mirror_view(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror_view(src, arg_prop);
+}
+
+// Create a mirror view and deep_copy in a new space
+template <class... ViewCtorArgs, class T, class... P>
+typename Kokkos::Impl::MirrorOffsetViewType<
+    typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+    P...>::view_type
+create_mirror_view_and_copy(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return {create_mirror_view_and_copy(arg_prop, src.view()), src.begins()};
+}
+
+template <class Space, class T, class... P>
+typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type
+create_mirror_view_and_copy(
+    const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src,
+    std::string const& name = "") {
+  return {create_mirror_view_and_copy(space, src.view(), name), src.begins()};
+}
 } /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_OFFSETVIEW
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_OFFSETVIEW
+#endif
 #endif /* KOKKOS_OFFSETVIEW_HPP_ */
diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
index e4dd9531f..a9529d1c8 100644
--- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -50,6 +50,10 @@
 
 #ifndef KOKKOS_SCATTER_VIEW_HPP
 #define KOKKOS_SCATTER_VIEW_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SCATTERVIEW
+#endif
 
 #include <Kokkos_Core.hpp>
 #include <utility>
@@ -300,11 +304,6 @@ struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, DeviceType,
     Kokkos::atomic_add(&dest, src);
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile ValueType& dest, const volatile ValueType& src) const {
-    Kokkos::atomic_add(&dest, src);
-  }
-
   KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
     this->join(value, rhs);
   }
@@ -374,11 +373,6 @@ struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, DeviceType,
     atomic_prod(&dest, src);
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile ValueType& dest, const volatile ValueType& src) const {
-    atomic_prod(&dest, src);
-  }
-
   KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
     atomic_prod(&value, rhs);
   }
@@ -433,11 +427,6 @@ struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, DeviceType,
     atomic_min(&dest, src);
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile ValueType& dest, const volatile ValueType& src) const {
-    atomic_min(dest, src);
-  }
-
   KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
     this->join(value, rhs);
   }
@@ -492,11 +481,6 @@ struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, DeviceType,
     atomic_max(&dest, src);
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile ValueType& dest, const volatile ValueType& src) const {
-    atomic_max(dest, src);
-  }
-
   KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) {
     this->join(value, rhs);
   }
@@ -836,6 +820,19 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
     ::Kokkos::resize(internal_view, n0, n1, n2, n3, n4, n5, n6, n7);
   }
 
+  template <class... ViewCtorArgs>
+  void resize(const ::Kokkos::Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+              const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    ::Kokkos::resize(arg_prop, internal_view, n0, n1, n2, n3, n4, n5, n6, n7);
+  }
+
   template <class I>
   std::enable_if_t<Kokkos::Impl::is_view_ctor_property<I>::value> resize(
       const I& arg_prop, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -849,6 +846,19 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
     ::Kokkos::resize(arg_prop, internal_view, n0, n1, n2, n3, n4, n5, n6, n7);
   }
 
+  template <class... ViewCtorArgs>
+  void realloc(const Kokkos::Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+               const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    ::Kokkos::realloc(arg_prop, internal_view, n0, n1, n2, n3, n4, n5, n6, n7);
+  }
+
   void realloc(const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -909,11 +919,10 @@ class ScatterAccess<DataType, Op, DeviceType, Layout, ScatterNonDuplicated,
   }
 
   template <typename Arg>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<view_type::original_view_type::rank == 1 &&
-                                  std::is_integral<Arg>::value,
-                              value_type>::type
-      operator[](Arg arg) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      view_type::original_view_type::rank == 1 && std::is_integral<Arg>::value,
+      value_type>
+  operator[](Arg arg) const {
     return view.at(arg);
   }
 
@@ -1108,6 +1117,19 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                      n6);
   }
 
+  template <class... ViewCtorArgs>
+  void resize(const ::Kokkos::Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+              const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+              const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    ::Kokkos::resize(arg_prop, internal_view, unique_token.size(), n0, n1, n2,
+                     n3, n4, n5, n6);
+  }
+
   template <class I>
   std::enable_if_t<Kokkos::Impl::is_view_ctor_property<I>::value> resize(
       const I& arg_prop, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -1121,6 +1143,19 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                      n3, n4, n5, n6);
   }
 
+  template <class... ViewCtorArgs>
+  void realloc(const ::Kokkos::Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+               const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    ::Kokkos::realloc(arg_prop, internal_view, unique_token.size(), n0, n1, n2,
+                      n3, n4, n5, n6);
+  }
+
   void realloc(const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -1443,11 +1478,10 @@ class ScatterAccess<DataType, Op, DeviceType, Layout, ScatterDuplicated,
   }
 
   template <typename Arg>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<view_type::original_view_type::rank == 1 &&
-                                  std::is_integral<Arg>::value,
-                              value_type>::type
-      operator[](Arg arg) const {
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      view_type::original_view_type::rank == 1 && std::is_integral<Arg>::value,
+      value_type>
+  operator[](Arg arg) const {
     return view.at(thread_id, arg);
   }
 
@@ -1482,16 +1516,16 @@ ScatterView<
     RT, typename ViewTraits<RT, RP...>::array_layout,
     typename ViewTraits<RT, RP...>::device_type, Op,
     std::conditional_t<
-        std::is_same<Duplication, void>::value,
+        std::is_void<Duplication>::value,
         typename Kokkos::Impl::Experimental::DefaultDuplication<
             typename ViewTraits<RT, RP...>::execution_space>::type,
         Duplication>,
     std::conditional_t<
-        std::is_same<Contribution, void>::value,
+        std::is_void<Contribution>::value,
         typename Kokkos::Impl::Experimental::DefaultContribution<
             typename ViewTraits<RT, RP...>::execution_space,
             typename std::conditional_t<
-                std::is_same<Duplication, void>::value,
+                std::is_void<Duplication>::value,
                 typename Kokkos::Impl::Experimental::DefaultDuplication<
                     typename ViewTraits<RT, RP...>::execution_space>::type,
                 Duplication>>::type,
@@ -1552,6 +1586,15 @@ void contribute(
 
 namespace Kokkos {
 
+template <typename DT, typename LY, typename ES, typename OP, typename CT,
+          typename DP, typename... IS, class... ViewCtorArgs>
+void realloc(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    Kokkos::Experimental::ScatterView<DT, LY, ES, OP, CT, DP>& scatter_view,
+    IS... is) {
+  scatter_view.realloc(arg_prop, is...);
+}
+
 template <typename DT, typename LY, typename ES, typename OP, typename CT,
           typename DP, typename... IS>
 void realloc(
@@ -1577,6 +1620,15 @@ void resize(
   scatter_view.resize(is...);
 }
 
+template <class... ViewCtorArgs, typename DT, typename LY, typename ES,
+          typename OP, typename CT, typename DP, typename... IS>
+void resize(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    Kokkos::Experimental::ScatterView<DT, LY, ES, OP, CT, DP>& scatter_view,
+    IS... is) {
+  scatter_view.resize(arg_prop, is...);
+}
+
 template <typename I, typename DT, typename LY, typename ES, typename OP,
           typename CT, typename DP, typename... IS>
 std::enable_if_t<Kokkos::Impl::is_view_ctor_property<I>::value> resize(
@@ -1588,4 +1640,8 @@ std::enable_if_t<Kokkos::Impl::is_view_ctor_property<I>::value> resize(
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SCATTERVIEW
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SCATTERVIEW
+#endif
 #endif
diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
index cd633e403..219b08b4b 100644
--- a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_STATICCRSGRAPH_HPP
 #define KOKKOS_STATICCRSGRAPH_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STATICCRSGRAPH
+#endif
 
 #include <string>
 #include <vector>
@@ -214,8 +218,7 @@ struct GraphRowViewConst {
       const typename GraphType::entries_type& colidx_in,
       const ordinal_type& stride, const ordinal_type& count,
       const OffsetType& idx,
-      const typename std::enable_if<std::is_integral<OffsetType>::value,
-                                    int>::type& = 0)
+      const std::enable_if_t<std::is_integral<OffsetType>::value, int>& = 0)
       : colidx_(&colidx_in(idx)), stride_(stride), length(count) {}
 
   /// \brief Number of entries in the row.
@@ -471,8 +474,7 @@ struct StaticCrsGraphMaximumEntry {
   void init(value_type& update) const { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
+  void join(value_type& update, const value_type& input) const {
     if (update < input) update = input;
   }
 };
@@ -498,4 +500,8 @@ DataType maximum_entry(const StaticCrsGraph<DataType, Arg1Type, Arg2Type,
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STATICCRSGRAPH
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STATICCRSGRAPH
+#endif
 #endif /* #ifndef KOKKOS_CRSARRAY_HPP */
diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index fbef0a013..6c112644c 100644
--- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -50,6 +50,10 @@
 
 #ifndef KOKKOS_UNORDERED_MAP_HPP
 #define KOKKOS_UNORDERED_MAP_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_UNORDEREDMAP
+#endif
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Functional.hpp>
@@ -62,7 +66,6 @@
 #include <iostream>
 
 #include <cstdint>
-#include <stdexcept>
 
 namespace Kokkos {
 
@@ -200,10 +203,9 @@ class UnorderedMapInsertResult {
 ///   <tt>Key</tt>.  The default will do a bitwise equality comparison.
 ///
 template <typename Key, typename Value,
-          typename Device = Kokkos::DefaultExecutionSpace,
-          typename Hasher = pod_hash<typename std::remove_const<Key>::type>,
-          typename EqualTo =
-              pod_equal_to<typename std::remove_const<Key>::type>>
+          typename Device  = Kokkos::DefaultExecutionSpace,
+          typename Hasher  = pod_hash<std::remove_const_t<Key>>,
+          typename EqualTo = pod_equal_to<std::remove_const_t<Key>>>
 class UnorderedMap {
  private:
   using host_mirror_space =
@@ -215,13 +217,13 @@ class UnorderedMap {
 
   // key_types
   using declared_key_type = Key;
-  using key_type          = typename std::remove_const<declared_key_type>::type;
-  using const_key_type    = typename std::add_const<key_type>::type;
+  using key_type          = std::remove_const_t<declared_key_type>;
+  using const_key_type    = std::add_const_t<key_type>;
 
   // value_types
   using declared_value_type = Value;
-  using value_type = typename std::remove_const<declared_value_type>::type;
-  using const_value_type = typename std::add_const<value_type>::type;
+  using value_type          = std::remove_const_t<declared_value_type>;
+  using const_value_type    = std::add_const_t<value_type>;
 
   using device_type     = Device;
   using execution_space = typename Device::execution_space;
@@ -241,7 +243,7 @@ class UnorderedMap {
   using const_map_type = UnorderedMap<const_key_type, const_value_type,
                                       device_type, hasher_type, equal_to_type>;
 
-  static const bool is_set = std::is_same<void, value_type>::value;
+  static const bool is_set = std::is_void<value_type>::value;
   static const bool has_const_key =
       std::is_same<const_key_type, declared_key_type>::value;
   static const bool has_const_value =
@@ -318,7 +320,7 @@ class UnorderedMap {
 #endif
         m_scalars("UnorderedMap scalars") {
     if (!is_insertable_map) {
-      throw std::runtime_error(
+      Kokkos::Impl::throw_runtime_exception(
           "Cannot construct a non-insertable (i.e. const key_type) "
           "unordered_map");
     }
@@ -742,10 +744,10 @@ class UnorderedMap {
   template <typename SKey, typename SValue>
   UnorderedMap(
       UnorderedMap<SKey, SValue, Device, Hasher, EqualTo> const &src,
-      typename std::enable_if<
+      std::enable_if_t<
           Impl::UnorderedMapCanAssign<declared_key_type, declared_value_type,
                                       SKey, SValue>::value,
-          int>::type = 0)
+          int> = 0)
       : m_bounded_insert(src.m_bounded_insert),
         m_hasher(src.m_hasher),
         m_equal_to(src.m_equal_to),
@@ -758,10 +760,10 @@ class UnorderedMap {
         m_scalars(src.m_scalars) {}
 
   template <typename SKey, typename SValue>
-  typename std::enable_if<
+  std::enable_if_t<
       Impl::UnorderedMapCanAssign<declared_key_type, declared_value_type, SKey,
                                   SValue>::value,
-      declared_map_type &>::type
+      declared_map_type &>
   operator=(UnorderedMap<SKey, SValue, Device, Hasher, EqualTo> const &src) {
     m_bounded_insert    = src.m_bounded_insert;
     m_hasher            = src.m_hasher;
@@ -777,10 +779,8 @@ class UnorderedMap {
   }
 
   template <typename SKey, typename SValue, typename SDevice>
-  typename std::enable_if<
-      std::is_same<typename std::remove_const<SKey>::type, key_type>::value &&
-      std::is_same<typename std::remove_const<SValue>::type,
-                   value_type>::value>::type
+  std::enable_if_t<std::is_same<std::remove_const_t<SKey>, key_type>::value &&
+                   std::is_same<std::remove_const_t<SValue>, value_type>::value>
   create_copy_view(
       UnorderedMap<SKey, SValue, SDevice, Hasher, EqualTo> const &src) {
     if (m_hash_lists.data() != src.m_hash_lists.data()) {
@@ -915,4 +915,8 @@ inline void deep_copy(
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_UNORDEREDMAP
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_UNORDEREDMAP
+#endif
 #endif  // KOKKOS_UNORDERED_MAP_HPP
diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp
index eddb87800..8dd080737 100644
--- a/packages/kokkos/containers/src/Kokkos_Vector.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_VECTOR_HPP
 #define KOKKOS_VECTOR_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR
+#endif
 
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_DualView.hpp>
@@ -185,8 +189,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
  public:
   // TODO: can use detection idiom to generate better error message here later
   template <typename InputIterator>
-  typename std::enable_if<impl_is_input_iterator<InputIterator>::value,
-                          iterator>::type
+  std::enable_if_t<impl_is_input_iterator<InputIterator>::value, iterator>
   insert(iterator it, InputIterator b, InputIterator e) {
     ptrdiff_t count = std::distance(b, e);
 
@@ -333,4 +336,8 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
 };
 
 }  // namespace Kokkos
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR
+#endif
 #endif
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
index 9512f2d4a..134b30769 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -86,9 +86,7 @@ struct BitsetCount {
   void init(value_type& count) const { count = 0u; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& count, const volatile size_type& incr) const {
-    count += incr;
-  }
+  void join(value_type& count, const size_type& incr) const { count += incr; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(size_type i, value_type& count) const {
diff --git a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
index 9fb6a4e1c..b81b1eee1 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
@@ -56,11 +56,10 @@ template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
           typename SizeType>
 inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                SizeType>::HostMirror
-create_mirror_view(
-    const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>&
-        view,
-    typename std::enable_if<ViewTraits<DataType, Arg1Type, Arg2Type,
-                                       Arg3Type>::is_hostspace>::type* = 0) {
+create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
+                                        SizeType>& view,
+                   std::enable_if_t<ViewTraits<DataType, Arg1Type, Arg2Type,
+                                               Arg3Type>::is_hostspace>* = 0) {
   return view;
 }
 
@@ -99,11 +98,10 @@ template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
           typename SizeType>
 inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                SizeType>::HostMirror
-create_mirror_view(
-    const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>&
-        view,
-    typename std::enable_if<!ViewTraits<DataType, Arg1Type, Arg2Type,
-                                        Arg3Type>::is_hostspace>::type* = 0) {
+create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
+                                        SizeType>& view,
+                   std::enable_if_t<!ViewTraits<DataType, Arg1Type, Arg2Type,
+                                                Arg3Type>::is_hostspace>* = 0) {
   return create_mirror(view);
 }
 }  // namespace Kokkos
diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
index e10e256b6..fc861992f 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_UnorderedMap.hpp>
 
 namespace Kokkos {
diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
index 80494139d..5acba244f 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
@@ -76,8 +76,15 @@ struct UnorderedMapRehash {
                  *this);
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void operator()(size_type i) const {
+  template <typename Dummy = typename map_type::value_type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_void<Dummy>::value>
+  operator()(size_type i) const {
+    if (m_src.valid_at(i)) m_dst.insert(m_src.key_at(i));
+  }
+
+  template <typename Dummy = typename map_type::value_type>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<!std::is_void<Dummy>::value>
+  operator()(size_type i) const {
     if (m_src.valid_at(i)) m_dst.insert(m_src.key_at(i), m_src.value_at(i));
   }
 };
diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp
index 6810ae101..6cb03d6c5 100644
--- a/packages/kokkos/containers/unit_tests/TestBitset.hpp
+++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp
@@ -75,9 +75,7 @@ struct TestBitset {
   void init(value_type& v) const { v = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
-    dst += src;
-  }
+  void join(value_type& dst, const value_type& src) const { dst += src; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(uint32_t i, value_type& v) const {
@@ -116,9 +114,7 @@ struct TestBitsetTest {
   void init(value_type& v) const { v = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
-    dst += src;
-  }
+  void join(value_type& dst, const value_type& src) const { dst += src; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(uint32_t i, value_type& v) const {
@@ -148,9 +144,7 @@ struct TestBitsetAny {
   void init(value_type& v) const { v = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
-    dst += src;
-  }
+  void join(value_type& dst, const value_type& src) const { dst += src; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(uint32_t i, value_type& v) const {
diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp
index 75829e076..3085f091c 100644
--- a/packages/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp
@@ -466,7 +466,8 @@ namespace {
  * that we keep the semantics of UVM DualViews intact.
  */
 // modify if we have other UVM enabled backends
-#ifdef KOKKOS_ENABLE_CUDA  // OR other UVM builds
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \
+    defined(KOKKOS_ENABLE_HIP)  // OR other UVM builds
 #define UVM_ENABLED_BUILD
 #endif
 
@@ -482,6 +483,20 @@ struct UVMSpaceFor<Kokkos::Cuda> {
 };
 #endif
 
+#ifdef KOKKOS_ENABLE_SYCL  // specific to SYCL
+template <>
+struct UVMSpaceFor<Kokkos::Experimental::SYCL> {
+  using type = Kokkos::Experimental::SYCLSharedUSMSpace;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP  // specific to HIP
+template <>
+struct UVMSpaceFor<Kokkos::Experimental::HIP> {
+  using type = Kokkos::Experimental::HIPManagedSpace;
+};
+#endif
+
 #ifdef UVM_ENABLED_BUILD
 template <>
 struct UVMSpaceFor<Kokkos::DefaultHostExecutionSpace> {
diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
index 321f1228a..5fbd32956 100644
--- a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -45,7 +45,6 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <Kokkos_DynRankView.hpp>
@@ -108,8 +107,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
@@ -193,8 +191,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
@@ -275,8 +272,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
@@ -370,8 +366,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
@@ -445,8 +440,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
@@ -543,8 +537,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
@@ -623,8 +616,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
@@ -724,6 +716,7 @@ class TestDynViewAPI {
     run_test_subview_strided();
     run_test_vector();
     run_test_as_view_of_rank_n();
+    run_test_layout();
   }
 
   static void run_operator_test_rank12345() {
@@ -1158,9 +1151,6 @@ class TestDynViewAPI {
 #endif  // MDRangePolict Rank < 7
 
 #endif  // defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-
-    // Error checking test
-    EXPECT_ANY_THROW({ auto v_copy = Kokkos::Impl::as_view_of_rank_n<2>(d); });
   }
 
   static void run_test_scalar() {
@@ -1898,6 +1888,28 @@ class TestDynViewAPI {
     typename smultivector_type::const_type cmvX(cmv);
     typename const_smultivector_type::const_type ccmvX(cmv);
   }
+
+  static void run_test_layout() {
+    Kokkos::DynRankView<double> d("source", 1, 2, 3, 4);
+    Kokkos::DynRankView<double> e("dest");
+
+    auto props = Kokkos::view_alloc(Kokkos::WithoutInitializing, d.label());
+    e          = Kokkos::DynRankView<double>(props, d.layout());
+
+    ASSERT_EQ(d.rank(), 4u);
+    ASSERT_EQ(e.rank(), 4u);
+    ASSERT_EQ(e.label(), "source");
+
+    auto ulayout = e.layout();
+    ASSERT_EQ(ulayout.dimension[0], 1u);
+    ASSERT_EQ(ulayout.dimension[1], 2u);
+    ASSERT_EQ(ulayout.dimension[2], 3u);
+    ASSERT_EQ(ulayout.dimension[3], 4u);
+    ASSERT_EQ(ulayout.dimension[4], KOKKOS_INVALID_INDEX);
+    ASSERT_EQ(ulayout.dimension[5], KOKKOS_INVALID_INDEX);
+    ASSERT_EQ(ulayout.dimension[6], KOKKOS_INVALID_INDEX);
+    ASSERT_EQ(ulayout.dimension[7], KOKKOS_INVALID_INDEX);
+  }
 };
 
 }  // namespace Test
diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
index a127c250e..9e9edc80b 100644
--- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
@@ -52,7 +52,6 @@
 #include <cstdio>
 #include <Kokkos_Timer.hpp>
 #include <Kokkos_OffsetView.hpp>
-#include <KokkosExp_MDRangePolicy.hpp>
 
 using std::cout;
 using std::endl;
@@ -64,14 +63,19 @@ void test_offsetview_construction() {
   using offset_view_type = Kokkos::Experimental::OffsetView<Scalar**, Device>;
   using view_type        = Kokkos::View<Scalar**, Device>;
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   Kokkos::Experimental::index_list_type range0 = {-1, 3};
   Kokkos::Experimental::index_list_type range1 = {-2, 2};
+#else
+  std::pair<int64_t, int64_t> range0 = {-1, 3};
+  std::pair<int64_t, int64_t> range1 = {-2, 2};
+#endif
 
   {
     offset_view_type o1;
     ASSERT_FALSE(o1.is_allocated());
 
-    o1 = offset_view_type("o1", range0, range1);
+    o1 = offset_view_type("o1", {-1, 3}, {-2, 2});
     offset_view_type o2(o1);
     offset_view_type o3("o3", range0, range1);
 
diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
index 9fddfdcca..b2f5c5a91 100644
--- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
@@ -90,7 +90,7 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
     scatterSize = n;
     auto policy =
         Kokkos::RangePolicy<typename DeviceType::execution_space, int>(0, n);
-    Kokkos::parallel_for(policy, *this, "scatter_view_test: Sum");
+    Kokkos::parallel_for("scatter_view_test: Sum", policy, *this);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -235,7 +235,7 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
     scatterSize = n;
     auto policy =
         Kokkos::RangePolicy<typename DeviceType::execution_space, int>(0, n);
-    Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
+    Kokkos::parallel_for("scatter_view_test: Prod", policy, *this);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -259,12 +259,10 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       auto val0 = host_view(i, 0);
       auto val1 = host_view(i, 1);
       auto val2 = host_view(i, 2);
-      EXPECT_TRUE(std::fabs((val0 - 65536.0) / 65536.0) < 1e-14)
-          << "Data differs at index " << i;
-      EXPECT_TRUE(std::fabs((val1 - 256.0) / 256.0) < 1e-14)
-          << "Data differs at index " << i;
-      EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14)
+      EXPECT_NEAR(val0, 65536.0, 1e-14 * 65536.0)
           << "Data differs at index " << i;
+      EXPECT_NEAR(val1, 256.0, 1e-14 * 256.0) << "Data differs at index " << i;
+      EXPECT_NEAR(val2, 1.0, 1e-14 * 1.0) << "Data differs at index " << i;
     }
   }
 
@@ -282,9 +280,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       auto val2 = host_view(i, 2);
       if (i >= std::get<0>(subRangeDim0) && i < std::get<1>(subRangeDim0)) {
         // is in subview
-        EXPECT_TRUE(std::fabs((val0 - 65536.0) / 65536.0) < 1e-14);
-        EXPECT_TRUE(std::fabs((val1 - 256.0) / 256.0) < 1e-14);
-        EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14);
+        EXPECT_NEAR(val0, 65536.0, 1e-14 * 65536.0);
+        EXPECT_NEAR(val1, 256.0, 1e-14 * 256.0);
+        EXPECT_NEAR(val2, 1.0, 1e-14 * 1.0);
       } else {
         // is outside of subview
         EXPECT_NEAR(val0, NumberType(1), 1e-14)
@@ -338,7 +336,7 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
     scatterSize = n;
     auto policy =
         Kokkos::RangePolicy<typename DeviceType::execution_space, int>(0, n);
-    Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
+    Kokkos::parallel_for("scatter_view_test: Prod", policy, *this);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -362,12 +360,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       auto val0 = host_view(i, 0);
       auto val1 = host_view(i, 1);
       auto val2 = host_view(i, 2);
-      EXPECT_TRUE(std::fabs((val0 - 4.0) / 4.0) < 1e-14)
-          << "Data differs at index " << i;
-      EXPECT_TRUE(std::fabs((val1 - 2.0) / 2.0) < 1e-14)
-          << "Data differs at index " << i;
-      EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14)
-          << "Data differs at index " << i;
+      EXPECT_NEAR(val0, 4.0, 1e-14 * 4.0) << "Data differs at index " << i;
+      EXPECT_NEAR(val1, 2.0, 1e-14 * 2.0) << "Data differs at index " << i;
+      EXPECT_NEAR(val2, 1.0, 1e-14 * 1.0) << "Data differs at index " << i;
     }
   }
 
@@ -385,12 +380,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       auto val2 = host_view(i, 2);
       if (i >= std::get<0>(subRangeDim0) && i < std::get<1>(subRangeDim0)) {
         // is in subview
-        EXPECT_TRUE(std::fabs((val0 - 4.0) / 4.0) < 1e-14)
-            << "Data differs at index " << i;
-        EXPECT_TRUE(std::fabs((val1 - 2.0) / 2.0) < 1e-14)
-            << "Data differs at index " << i;
-        EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14)
-            << "Data differs at index " << i;
+        EXPECT_NEAR(val0, 4.0, 1e-14 * 4.0) << "Data differs at index " << i;
+        EXPECT_NEAR(val1, 2.0, 1e-14 * 2.0) << "Data differs at index " << i;
+        EXPECT_NEAR(val2, 1.0, 1e-14 * 1.0) << "Data differs at index " << i;
       } else {
         // is outside of subview
         EXPECT_NEAR(val0, NumberType(999999), 1e-14)
@@ -443,7 +435,7 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
   void run_parallel(int n) {
     scatterSize = n;
     Kokkos::RangePolicy<typename DeviceType::execution_space, int> policy(0, n);
-    Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod");
+    Kokkos::parallel_for("scatter_view_test: Prod", policy, *this);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -467,12 +459,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       auto val0 = host_view(i, 0);
       auto val1 = host_view(i, 1);
       auto val2 = host_view(i, 2);
-      EXPECT_TRUE(std::fabs((val0 - 16.0) / 16.0) < 1e-14)
-          << "Data differs at index " << i;
-      EXPECT_TRUE(std::fabs((val1 - 8.0) / 8.0) < 1e-14)
-          << "Data differs at index " << i;
-      EXPECT_TRUE(std::fabs((val2 - 4.0) / 4.0) < 1e-14)
-          << "Data differs at index " << i;
+      EXPECT_NEAR(val0, 16.0, 1e-14 * 16.0) << "Data differs at index " << i;
+      EXPECT_NEAR(val1, 8.0, 1e-14 * 8.0) << "Data differs at index " << i;
+      EXPECT_NEAR(val2, 4.0, 1e-14 * 4.0) << "Data differs at index " << i;
     }
   }
 
@@ -490,12 +479,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       auto val2 = host_view(i, 2);
       if (i >= std::get<0>(subRangeDim0) && i < std::get<1>(subRangeDim0)) {
         // is in subview
-        EXPECT_TRUE(std::fabs((val0 - 16.0) / 16.0) < 1e-14)
-            << "Data differs at index " << i;
-        EXPECT_TRUE(std::fabs((val1 - 8.0) / 8.0) < 1e-14)
-            << "Data differs at index " << i;
-        EXPECT_TRUE(std::fabs((val2 - 4.0) / 4.0) < 1e-14)
-            << "Data differs at index " << i;
+        EXPECT_NEAR(val0, 16.0, 1e-14 * 16.0) << "Data differs at index " << i;
+        EXPECT_NEAR(val1, 8.0, 1e-14 * 8.0) << "Data differs at index " << i;
+        EXPECT_NEAR(val2, 4.0, 1e-14 * 4.0) << "Data differs at index " << i;
       } else {
         // is outside of subview
         EXPECT_NEAR(val0, NumberType(0), 1e-14)
@@ -888,7 +874,7 @@ TEST(TEST_CATEGORY, scatterview_devicetype) {
 #else
   using device_execution_space = Kokkos::Experimental::HIP;
   using device_memory_space    = Kokkos::Experimental::HIPSpace;
-  using host_accessible_space  = Kokkos::Experimental::HIPHostPinnedSpace;
+  using host_accessible_space  = Kokkos::Experimental::HIPManagedSpace;
 #endif
   if (std::is_same<TEST_EXECSPACE, device_execution_space>::value) {
     using device_device_type =
diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
index 1550ca7b5..3f5f97d6b 100644
--- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
+++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -87,8 +87,7 @@ struct TestInsert {
   void init(value_type &failed_count) const { failed_count = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type &failed_count,
-            const volatile value_type &count) const {
+  void join(value_type &failed_count, const value_type &count) const {
     failed_count += count;
   }
 
@@ -156,9 +155,7 @@ struct TestFind {
   static void init(value_type &dst) { dst = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &dst, const volatile value_type &src) {
-    dst += src;
-  }
+  static void join(value_type &dst, const value_type &src) { dst += src; }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(typename execution_space::size_type i,
@@ -337,6 +334,9 @@ TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) {
   m.insert(5);
   m.insert(7);
   ASSERT_EQ(4u, m.size());
+  m.rehash(0);
+  ASSERT_EQ(128u, m.capacity());
+  ASSERT_EQ(4u, m.size());
 
   m.clear();
   ASSERT_EQ(0u, m.size());
diff --git a/packages/kokkos/containers/unit_tests/TestVector.hpp b/packages/kokkos/containers/unit_tests/TestVector.hpp
index c093c7b0c..efb21fe13 100644
--- a/packages/kokkos/containers/unit_tests/TestVector.hpp
+++ b/packages/kokkos/containers/unit_tests/TestVector.hpp
@@ -298,6 +298,19 @@ TEST(TEST_CATEGORY, vector_insert) {
   Impl::test_vector_insert<int, TEST_EXECSPACE>(3057);
 }
 
+// The particular scenario below triggered a bug where empty modified_flags
+// would cause resize in push_back to be executed on the device overwriting the
+// values that were stored on the host previously.
+TEST(TEST_CATEGORY, vector_push_back_default_exec) {
+  Kokkos::vector<int, TEST_EXECSPACE> V;
+  V.clear();
+  V.push_back(4);
+  ASSERT_EQ(V[0], 4);
+  V.push_back(3);
+  ASSERT_EQ(V[1], 3);
+  ASSERT_EQ(V[0], 4);
+}
+
 }  // namespace Test
 
 #endif  // KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
index feae32179..174773f19 100644
--- a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
+++ b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
@@ -45,7 +45,9 @@
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
+#include <Kokkos_DynamicView.hpp>
 #include <Kokkos_DynRankView.hpp>
+#include <Kokkos_OffsetView.hpp>
 #include <Kokkos_ScatterView.hpp>
 
 #include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp>
@@ -59,7 +61,12 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
   auto success = validate_absence(
       [&]() {
         Kokkos::resize(Kokkos::WithoutInitializing, bla, 5, 6, 7, 9);
+        EXPECT_EQ(bla.template view<TEST_EXECSPACE>().label(), "bla");
         Kokkos::realloc(Kokkos::WithoutInitializing, bla, 8, 8, 8, 8);
+        EXPECT_EQ(bla.template view<TEST_EXECSPACE>().label(), "bla");
+        Kokkos::realloc(Kokkos::view_alloc(Kokkos::WithoutInitializing), bla, 5,
+                        6, 7, 8);
+        EXPECT_EQ(bla.template view<TEST_EXECSPACE>().label(), "bla");
       },
       [&](BeginParallelForEvent event) {
         if (event.descriptor().find("initialization") != std::string::npos)
@@ -85,7 +92,9 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) {
   auto success = validate_absence(
       [&]() {
         Kokkos::resize(bla, 8, 7, 6, 5);
+        EXPECT_EQ(bla.template view<TEST_EXECSPACE>().label(), "bla");
         Kokkos::realloc(Kokkos::WithoutInitializing, bla, 8, 7, 6, 5);
+        EXPECT_EQ(bla.template view<TEST_EXECSPACE>().label(), "bla");
       },
       [&](BeginParallelForEvent) {
         return MatchDiagnostic{true, {"Found begin event"}};
@@ -103,6 +112,74 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) {
   listen_tool_events(Config::DisableAll());
 }
 
+TEST(TEST_CATEGORY, resize_exec_space_dualview) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences(),
+                     Config::EnableKernels());
+  Kokkos::DualView<int*** * [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6,
+                                                              5);
+
+  auto success = validate_absence(
+      [&]() {
+        Kokkos::resize(
+            Kokkos::view_alloc(TEST_EXECSPACE{}, Kokkos::WithoutInitializing),
+            bla, 5, 6, 7, 8);
+        EXPECT_EQ(bla.template view<TEST_EXECSPACE>().label(), "bla");
+      },
+      [&](BeginFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](BeginParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, realloc_exec_space_dualview) {
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<typename TEST_EXECSPACE::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
+#endif
+
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::DualView<int*, TEST_EXECSPACE>;
+  view_type v(Kokkos::view_alloc(TEST_EXECSPACE{}, "bla"), 8);
+
+  auto success = validate_absence(
+      [&]() {
+        Kokkos::realloc(Kokkos::view_alloc(TEST_EXECSPACE{}), v, 8);
+        EXPECT_EQ(v.template view<TEST_EXECSPACE>().label(), "bla");
+      },
+      [&](BeginFenceEvent event) {
+        if ((event.descriptor().find("Debug Only Check for Execution Error") !=
+             std::string::npos) ||
+            (event.descriptor().find("HostSpace fence") != std::string::npos))
+          return MatchDiagnostic{false};
+        return MatchDiagnostic{true, {"Found fence event!"}};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
 TEST(TEST_CATEGORY, resize_realloc_no_init_dynrankview) {
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels());
@@ -111,7 +188,51 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dynrankview) {
   auto success = validate_absence(
       [&]() {
         Kokkos::resize(Kokkos::WithoutInitializing, bla, 5, 6, 7, 9);
+        EXPECT_EQ(bla.label(), "bla");
         Kokkos::realloc(Kokkos::WithoutInitializing, bla, 8, 8, 8, 8);
+        EXPECT_EQ(bla.label(), "bla");
+        Kokkos::realloc(Kokkos::view_alloc(Kokkos::WithoutInitializing), bla, 5,
+                        6, 7, 8);
+        EXPECT_EQ(bla.label(), "bla");
+      },
+      [&](BeginParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, resize_exec_space_dynrankview) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences(),
+                     Config::EnableKernels());
+  Kokkos::DynRankView<int, TEST_EXECSPACE> bla("bla", 8, 7, 6, 5);
+
+  auto success = validate_absence(
+      [&]() {
+        Kokkos::resize(
+            Kokkos::view_alloc(TEST_EXECSPACE{}, Kokkos::WithoutInitializing),
+            bla, 5, 6, 7, 8);
+        EXPECT_EQ(bla.label(), "bla");
+      },
+      [&](BeginFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
       },
       [&](BeginParallelForEvent event) {
         if (event.descriptor().find("initialization") != std::string::npos)
@@ -127,6 +248,45 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dynrankview) {
   listen_tool_events(Config::DisableAll());
 }
 
+TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) {
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<typename TEST_EXECSPACE::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
+#endif
+// FIXME_THREADS The Threads backend fences every parallel_for
+#ifdef KOKKOS_ENABLE_THREADS
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Threads>::value)
+    GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous";
+#endif
+
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::DynRankView<int, TEST_EXECSPACE>;
+  view_type outer_view, outer_view2;
+
+  auto success = validate_absence(
+      [&]() {
+        view_type inner_view(Kokkos::view_alloc(TEST_EXECSPACE{}, "bla"), 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+        Kokkos::realloc(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, TEST_EXECSPACE{}),
+            inner_view, 10);
+        EXPECT_EQ(inner_view.label(), "bla");
+        outer_view2 = inner_view;
+      },
+      [&](BeginFenceEvent event) {
+        if ((event.descriptor().find("Debug Only Check for Execution Error") !=
+             std::string::npos) ||
+            (event.descriptor().find("HostSpace fence") != std::string::npos))
+          return MatchDiagnostic{false};
+        return MatchDiagnostic{true, {"Found fence event!"}};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
 TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) {
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels());
@@ -137,7 +297,12 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) {
   auto success = validate_absence(
       [&]() {
         Kokkos::resize(Kokkos::WithoutInitializing, bla, 4, 5, 6, 8);
+        EXPECT_EQ(bla.subview().label(), "bla");
         Kokkos::realloc(Kokkos::WithoutInitializing, bla, 8, 8, 8, 8);
+        EXPECT_EQ(bla.subview().label(), "bla");
+        Kokkos::realloc(Kokkos::view_alloc(Kokkos::WithoutInitializing), bla, 5,
+                        6, 7, 8);
+        EXPECT_EQ(bla.subview().label(), "bla");
       },
       [&](BeginParallelForEvent event) {
         if (event.descriptor().find("initialization") != std::string::npos)
@@ -164,7 +329,9 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) {
   auto success = validate_absence(
       [&]() {
         Kokkos::resize(bla, 7, 6, 5, 4);
+        EXPECT_EQ(bla.subview().label(), "bla");
         Kokkos::realloc(Kokkos::WithoutInitializing, bla, 7, 6, 5, 4);
+        EXPECT_EQ(bla.subview().label(), "bla");
       },
       [&](BeginParallelForEvent) {
         return MatchDiagnostic{true, {"Found begin event"}};
@@ -181,3 +348,388 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) {
   ASSERT_TRUE(success);
   listen_tool_events(Config::DisableAll());
 }
+
+TEST(TEST_CATEGORY, resize_exec_space_scatterview) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences(),
+                     Config::EnableKernels());
+  Kokkos::Experimental::ScatterView<
+      int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE>
+      bla("bla", 7, 6, 5, 4);
+
+  auto success = validate_absence(
+      [&]() {
+        Kokkos::resize(
+            Kokkos::view_alloc(TEST_EXECSPACE{}, Kokkos::WithoutInitializing),
+            bla, 5, 6, 7, 8);
+        EXPECT_EQ(bla.subview().label(), "bla");
+      },
+      [&](BeginFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](BeginParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, realloc_exec_space_scatterview) {
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<typename TEST_EXECSPACE::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
+#endif
+// FIXME_THREADS The Threads backend fences every parallel_for
+#ifdef KOKKOS_ENABLE_THREADS
+  if (std::is_same<typename TEST_EXECSPACE, Kokkos::Threads>::value)
+    GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous";
+#endif
+
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::Experimental::ScatterView<
+      int*, typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE>;
+  view_type outer_view, outer_view2;
+
+  auto success = validate_absence(
+      [&]() {
+        view_type inner_view(Kokkos::view_alloc(TEST_EXECSPACE{}, "bla"), 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+        Kokkos::realloc(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, TEST_EXECSPACE{}),
+            inner_view, 10);
+        EXPECT_EQ(inner_view.subview().label(), "bla");
+        outer_view2 = inner_view;
+        Kokkos::realloc(Kokkos::view_alloc(TEST_EXECSPACE{}), inner_view, 10);
+        EXPECT_EQ(inner_view.subview().label(), "bla");
+      },
+      [&](BeginFenceEvent event) {
+        if ((event.descriptor().find("Debug Only Check for Execution Error") !=
+             std::string::npos) ||
+            (event.descriptor().find("HostSpace fence") != std::string::npos))
+          return MatchDiagnostic{false};
+        return MatchDiagnostic{true, {"Found fence event!"}};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, create_mirror_no_init_dynrankview) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::DynRankView<int, TEST_EXECSPACE> device_view("device view", 10);
+  Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device =
+            Kokkos::create_mirror(Kokkos::WithoutInitializing, device_view);
+        auto mirror_host = Kokkos::create_mirror(Kokkos::WithoutInitializing,
+                                                 TEST_EXECSPACE{}, host_view);
+        auto mirror_device_view = Kokkos::create_mirror_view(
+            Kokkos::WithoutInitializing, device_view);
+        auto mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::WithoutInitializing, TEST_EXECSPACE{}, host_view);
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found begin event"}};
+      },
+      [&](EndParallelForEvent) {
+        return MatchDiagnostic{true, {"Found end event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+TEST(TEST_CATEGORY, create_mirror_no_init_dynrankview_viewctor) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
+      "device view", 10);
+  Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing), device_view);
+        auto mirror_host = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+        auto mirror_device_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing), device_view);
+        auto mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found begin event"}};
+      },
+      [&](EndParallelForEvent) {
+        return MatchDiagnostic{true, {"Found end event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynrankview) {
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<typename TEST_EXECSPACE::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    return;
+#endif
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
+                     Config::EnableFences());
+
+  Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
+  decltype(Kokkos::create_mirror_view_and_copy(TEST_EXECSPACE{},
+                                               host_view)) device_view;
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror_view_and_copy(
+            Kokkos::view_alloc(TEST_EXECSPACE{},
+                               typename TEST_EXECSPACE::memory_space{}),
+            host_view);
+        // Avoid fences for deallocation when mirror_device goes out of scope.
+        device_view = mirror_device;
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found parallel_for event"}};
+      },
+      [&](BeginFenceEvent) {
+        return MatchDiagnostic{true, {"Found fence event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+TEST(TEST_CATEGORY, create_mirror_no_init_offsetview) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::Experimental::OffsetView<int*, TEST_EXECSPACE> device_view(
+      "device view", {0, 10});
+  Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
+      "host view", {0, 10});
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device =
+            Kokkos::create_mirror(Kokkos::WithoutInitializing, device_view);
+        auto mirror_host = Kokkos::create_mirror(Kokkos::WithoutInitializing,
+                                                 TEST_EXECSPACE{}, host_view);
+        auto mirror_device_view = Kokkos::create_mirror_view(
+            Kokkos::WithoutInitializing, device_view);
+        auto mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::WithoutInitializing, TEST_EXECSPACE{}, host_view);
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found begin event"}};
+      },
+      [&](EndParallelForEvent) {
+        return MatchDiagnostic{true, {"Found end event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+TEST(TEST_CATEGORY, create_mirror_no_init_offsetview_view_ctor) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
+      device_view("device view", {0, 10});
+  Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
+      "host view", {0, 10});
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing), device_view);
+        auto mirror_host = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+        auto mirror_device_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing), device_view);
+        auto mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found begin event"}};
+      },
+      [&](EndParallelForEvent) {
+        return MatchDiagnostic{true, {"Found end event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+TEST(TEST_CATEGORY, create_mirror_view_and_copy_offsetview) {
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<typename TEST_EXECSPACE::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    return;
+#endif
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
+                     Config::EnableFences());
+
+  Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
+      "host view", {0, 10});
+  decltype(Kokkos::create_mirror_view_and_copy(TEST_EXECSPACE{},
+                                               host_view)) device_view;
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror_view_and_copy(
+            Kokkos::view_alloc(TEST_EXECSPACE{},
+                               typename TEST_EXECSPACE::memory_space{}),
+            host_view);
+        // Avoid fences for deallocation when mirror_device goes out of scope.
+        device_view               = mirror_device;
+        auto mirror_device_mirror = Kokkos::create_mirror_view_and_copy(
+            Kokkos::view_alloc(TEST_EXECSPACE{},
+                               typename TEST_EXECSPACE::memory_space{}),
+            mirror_device);
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found parallel_for event"}};
+      },
+      [&](BeginFenceEvent) {
+        return MatchDiagnostic{true, {"Found fence event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+// FIXME OPENMPTARGET
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::Experimental::DynamicView<int*, TEST_EXECSPACE> device_view(
+      "device view", 2, 10);
+  Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
+      "host view", 2, 10);
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device =
+            Kokkos::create_mirror(Kokkos::WithoutInitializing, device_view);
+        auto mirror_host = Kokkos::create_mirror(Kokkos::WithoutInitializing,
+                                                 TEST_EXECSPACE{}, host_view);
+        auto mirror_device_view = Kokkos::create_mirror_view(
+            Kokkos::WithoutInitializing, device_view);
+        auto mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::WithoutInitializing, TEST_EXECSPACE{}, host_view);
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found begin event"}};
+      },
+      [&](EndParallelForEvent) {
+        return MatchDiagnostic{true, {"Found end event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<typename TEST_EXECSPACE::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    return;
+#endif
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
+                     Config::EnableFences());
+
+  Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
+      "host view", 2, 10);
+  decltype(Kokkos::create_mirror_view_and_copy(TEST_EXECSPACE{},
+                                               host_view)) device_view;
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror_view_and_copy(
+            Kokkos::view_alloc(TEST_EXECSPACE{},
+                               typename TEST_EXECSPACE::memory_space{}),
+            host_view);
+        // Avoid fences for deallocation when mirror_device goes out of scope.
+        device_view               = mirror_device;
+        auto mirror_device_mirror = Kokkos::create_mirror_view_and_copy(
+            Kokkos::view_alloc(TEST_EXECSPACE{},
+                               typename TEST_EXECSPACE::memory_space{}),
+            mirror_device);
+      },
+      [&](BeginFenceEvent event) {
+        if (event.descriptor().find("DynamicView::resize_serial: Fence after "
+                                    "copying chunks to the device") !=
+            std::string::npos)
+          return MatchDiagnostic{false};
+        return MatchDiagnostic{true, {"Found fence event"}};
+      },
+      [&](EndFenceEvent) { return MatchDiagnostic{false}; },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found parallel_for event"}};
+      });
+  ASSERT_TRUE(success);
+}
+#endif
+
+// FIXME OPENMPTARGET
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview_view_ctor) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
+      device_view("device view", 2, 10);
+  Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
+      "host view", 2, 10);
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing), device_view);
+        auto mirror_host = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+        auto mirror_device_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing), device_view);
+        auto mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+      },
+      [&](BeginFenceEvent event) {
+        if (event.descriptor().find("DynamicView::resize_serial: Fence after "
+                                    "copying chunks to the device") !=
+            std::string::npos)
+          return MatchDiagnostic{false};
+        return MatchDiagnostic{true, {"Found fence event"}};
+      },
+      [&](EndFenceEvent) { return MatchDiagnostic{false}; },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found begin event"}};
+      },
+      [&](EndParallelForEvent) {
+        return MatchDiagnostic{true, {"Found end event"}};
+      });
+  ASSERT_TRUE(success);
+}
+#endif
diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt
index a7c57a943..7ba97dbfb 100644
--- a/packages/kokkos/core/perf_test/CMakeLists.txt
+++ b/packages/kokkos/core/perf_test/CMakeLists.txt
@@ -96,11 +96,14 @@ IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA)
   )
 ENDIF()
 
+# FIXME_NVHPC
+IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
   PerformanceTest_Mempool
   SOURCES test_mempool.cpp
   CATEGORIES PERFORMANCE
 )
+ENDIF()
 
 IF(NOT Kokkos_ENABLE_OPENMPTARGET)
 # FIXME OPENMPTARGET needs tasking
diff --git a/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
index e133dafa3..5be29e65d 100644
--- a/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
+++ b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
@@ -72,8 +72,7 @@ struct Dot {
   void operator()(int i, value_type& update) const { update += X[i] * Y[i]; }
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& source) {
+  static void join(value_type& update, const value_type& source) {
     update += source;
   }
 
@@ -105,8 +104,7 @@ struct DotSingle {
   }
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& source) {
+  static void join(value_type& update, const value_type& source) {
     update += source;
   }
 
diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
index b534c32c5..31a01184c 100644
--- a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -69,7 +69,7 @@ struct InvNorm2 : public Kokkos::DotSingle<VectorView> {
 
   KOKKOS_INLINE_FUNCTION
   void final(value_type& result) const {
-    result = Kokkos::Experimental::sqrt(result);
+    result = Kokkos::sqrt(result);
     Rjj()  = result;
     inv()  = (0 < result) ? 1.0 / result : 0;
   }
diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt
index 793e07a84..684ea353a 100644
--- a/packages/kokkos/core/src/CMakeLists.txt
+++ b/packages/kokkos/core/src/CMakeLists.txt
@@ -1,17 +1,19 @@
-#I have to leave these here for tribits
 KOKKOS_INCLUDE_DIRECTORIES(
   ${CMAKE_CURRENT_BINARY_DIR}
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${KOKKOS_TOP_BUILD_DIR}
 )
+IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND)
+  KOKKOS_INCLUDE_DIRECTORIES(
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include
+  )
+ENDIF()
+
 
 INSTALL (DIRECTORY
   "${CMAKE_CURRENT_SOURCE_DIR}/"
   DESTINATION ${KOKKOS_HEADER_DIR}
   FILES_MATCHING
-  PATTERN desul/src EXCLUDE
-  PATTERN "*.inc"
-  PATTERN "*.inc_*"
   PATTERN "*.hpp"
   PATTERN "*.h"
 )
@@ -37,6 +39,11 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp)
 ENDIF()
 
+IF (KOKKOS_ENABLE_OPENACC)
+  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp)
+ENDIF()
+
 IF (KOKKOS_ENABLE_THREADS)
   APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp)
@@ -57,10 +64,8 @@ IF (NOT KOKKOS_ENABLE_MEMKIND)
 ENDIF()
 
 IF (KOKKOS_ENABLE_SERIAL)
+  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp)
-ELSE()
-  LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial.cpp)
-  LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial_task.cpp)
 ENDIF()
 
 IF (KOKKOS_ENABLE_SYCL)
@@ -68,12 +73,25 @@ IF (KOKKOS_ENABLE_SYCL)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp)
 ENDIF()
 
-IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS)
-  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/desul/src/*.cpp)
-  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*.hpp)
-  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*.hpp)
-  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.hpp)
-  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.inc)
+IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND)
+  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*)
+
+  INSTALL (DIRECTORY
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul"
+    DESTINATION ${KOKKOS_HEADER_DIR}
+    FILES_MATCHING
+    PATTERN "*.inc"
+    PATTERN "*.inc_*"
+    PATTERN "*.hpp"
+  )
+  MESSAGE(STATUS "Using internal desul_atomics copy")
+ELSE()
+  MESSAGE(STATUS "Using external desul_atomics install found at:")
+  MESSAGE(STATUS "  " ${desul_DIR})
 ENDIF()
 
 
@@ -89,6 +107,11 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore
   ${CMAKE_CURRENT_BINARY_DIR}
   ${CMAKE_CURRENT_SOURCE_DIR}
 )
+IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND NOT desul_FOUND)
+  KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include
+  )
+ENDIF()
 
 KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND)
@@ -108,10 +131,13 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)
 #        libatomic
 # XL requires libatomic even for 64 bit CAS, most others only for 128
 # I (CT) had removed 128bit CAS from desul to not need libatomic.
-IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS AND
+IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND
     (KOKKOS_ENABLE_OPENMPTARGET OR (CMAKE_CXX_COMPILER_ID STREQUAL XLClang)))
   target_link_libraries(kokkoscore PUBLIC atomic)
 ENDIF()
 
+IF (Kokkos_ENABLE_IMPL_DESUL_ATOMICS AND desul_FOUND)
+  target_link_libraries(kokkoscore PUBLIC desul_atomics)
+ENDIF()
 
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH)
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 31601944b..b2161bc1f 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_CUDA
 
@@ -52,7 +56,6 @@
 #include <cstdlib>
 #include <iostream>
 #include <sstream>
-#include <stdexcept>
 #include <algorithm>
 #include <atomic>
 
@@ -199,14 +202,22 @@ void *CudaSpace::allocate(const size_t arg_alloc_size) const {
   return allocate("[unlabeled]", arg_alloc_size);
 }
 
+void *CudaSpace::allocate(const Cuda &exec_space, const char *arg_label,
+                          const size_t arg_alloc_size,
+                          const size_t arg_logical_size) const {
+  return impl_allocate(exec_space, arg_label, arg_alloc_size, arg_logical_size);
+}
 void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
                           const size_t arg_logical_size) const {
   return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
 }
-void *CudaSpace::impl_allocate(
-    const char *arg_label, const size_t arg_alloc_size,
-    const size_t arg_logical_size,
-    const Kokkos::Tools::SpaceHandle arg_handle) const {
+
+namespace {
+void *impl_allocate_common(const Cuda &exec_space, const char *arg_label,
+                           const size_t arg_alloc_size,
+                           const size_t arg_logical_size,
+                           const Kokkos::Tools::SpaceHandle arg_handle,
+                           bool exec_space_provided) {
   void *ptr = nullptr;
 
 #ifndef CUDART_VERSION
@@ -214,12 +225,20 @@ void *CudaSpace::impl_allocate(
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
   cudaError_t error_code;
   if (arg_alloc_size >= memory_threshold_g) {
-    error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    if (exec_space_provided) {
+      cudaStream_t stream = exec_space.cuda_stream();
+      error_code          = cudaMallocAsync(&ptr, arg_alloc_size, stream);
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+    } else {
+      error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    }
   } else {
     error_code = cudaMalloc(&ptr, arg_alloc_size);
   }
 #else
+  (void)exec_space;
+  (void)exec_space_provided;
   auto error_code = cudaMalloc(&ptr, arg_alloc_size);
 #endif
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
@@ -239,6 +258,23 @@ void *CudaSpace::impl_allocate(
   }
   return ptr;
 }
+}  // namespace
+
+void *CudaSpace::impl_allocate(
+    const char *arg_label, const size_t arg_alloc_size,
+    const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size,
+                              arg_logical_size, arg_handle, false);
+}
+
+void *CudaSpace::impl_allocate(
+    const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size,
+    const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  return impl_allocate_common(exec_space, arg_label, arg_alloc_size,
+                              arg_logical_size, arg_handle, true);
+}
 
 void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
   return allocate("[unlabeled]", arg_alloc_size);
@@ -493,6 +529,17 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::~SharedAllocationRecord() {
                      alloc_size, (alloc_size - sizeof(SharedAllocationHeader)));
 }
 
+void SharedAllocationRecord<Kokkos::CudaSpace, void>::deep_copy_header_no_exec(
+    void *ptr, const void *header) {
+  Kokkos::Cuda exec;
+  Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(exec, ptr, header,
+                                               sizeof(SharedAllocationHeader));
+  exec.fence(
+      "SharedAllocationRecord<Kokkos::CudaSpace, "
+      "void>::SharedAllocationRecord(): fence after copying header from "
+      "HostSpace");
+}
+
 SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::~SharedAllocationRecord() {
   m_space.deallocate(m_label.c_str(),
                      SharedAllocationRecord<void, void>::m_alloc_ptr,
@@ -547,6 +594,33 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
       "HostSpace");
 }
 
+SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
+    const Kokkos::Cuda &arg_exec_space, const Kokkos::CudaSpace &arg_space,
+    const std::string &arg_label, const size_t arg_alloc_size,
+    const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(arg_exec_space, arg_space,
+                                               arg_label, arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+          arg_label),
+      m_tex_obj(0),
+      m_space(arg_space) {
+
+  SharedAllocationHeader header;
+
+  this->base_t::_fill_host_accessible_header_info(header, arg_label);
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(arg_exec_space,
+                                               RecordBase::m_alloc_ptr, &header,
+                                               sizeof(SharedAllocationHeader));
+}
+
 SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
     const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label,
     const size_t arg_alloc_size,
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp
index 8a6c0433c..8e8dff677 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp
@@ -464,20 +464,19 @@ inline __device__ int __stronger_order_simt_(int a, int b) {
     base
 */
 
-#define DO__atomic_load_simt_(bytes, bits)                                 \
-  template <class type,                                                    \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \
-  void __device__ __atomic_load_simt_(const type *ptr, type *ret,          \
-                                      int memorder) {                      \
-    int##bits##_t tmp = 0;                                                 \
-    switch (memorder) {                                                    \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                           \
-      case __ATOMIC_CONSUME:                                               \
-      case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break;  \
-      case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break;  \
-      default: assert(0);                                                  \
-    }                                                                      \
-    memcpy(ret, &tmp, bytes);                                              \
+#define DO__atomic_load_simt_(bytes, bits)                                \
+  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0> \
+  void __device__ __atomic_load_simt_(const type *ptr, type *ret,         \
+                                      int memorder) {                     \
+    int##bits##_t tmp = 0;                                                \
+    switch (memorder) {                                                   \
+      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                          \
+      case __ATOMIC_CONSUME:                                              \
+      case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break; \
+      case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break; \
+      default: assert(0);                                                 \
+    }                                                                     \
+    memcpy(ret, &tmp, bytes);                                             \
   }
 DO__atomic_load_simt_(1, 32) DO__atomic_load_simt_(2, 16)
     DO__atomic_load_simt_(4, 32) DO__atomic_load_simt_(8, 64)
@@ -490,8 +489,7 @@ DO__atomic_load_simt_(1, 32) DO__atomic_load_simt_(2, 16)
 }
 
 #define DO__atomic_store_simt_(bytes, bits)                                  \
-  template <class type,                                                      \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0>   \
+  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>    \
   void __device__ __atomic_store_simt_(type *ptr, type *val, int memorder) { \
     int##bits##_t tmp = 0;                                                   \
     memcpy(&tmp, val, bytes);                                                \
@@ -511,49 +509,47 @@ DO__atomic_store_simt_(1, 32) DO__atomic_store_simt_(2, 16)
   __atomic_store_simt_(ptr, &val, memorder);
 }
 
-#define DO__atomic_compare_exchange_simt_(bytes, bits)                     \
-  template <class type,                                                    \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \
-  bool __device__ __atomic_compare_exchange_simt_(                         \
-      type *ptr, type *expected, const type *desired, bool,                \
-      int success_memorder, int failure_memorder) {                        \
-    int##bits##_t tmp = 0, old = 0, old_tmp;                               \
-    memcpy(&tmp, desired, bytes);                                          \
-    memcpy(&old, expected, bytes);                                         \
-    old_tmp = old;                                                         \
-    switch (__stronger_order_simt_(success_memorder, failure_memorder)) {  \
-      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                           \
-      case __ATOMIC_CONSUME:                                               \
-      case __ATOMIC_ACQUIRE:                                               \
-        __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp);                 \
-        break;                                                             \
-      case __ATOMIC_ACQ_REL:                                               \
-        __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp);                 \
-        break;                                                             \
-      case __ATOMIC_RELEASE:                                               \
-        __simt_cas_release_##bits(ptr, old, old_tmp, tmp);                 \
-        break;                                                             \
-      case __ATOMIC_RELAXED:                                               \
-        __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp);                 \
-        break;                                                             \
-      default: assert(0);                                                  \
-    }                                                                      \
-    bool const ret = old == old_tmp;                                       \
-    memcpy(expected, &old, bytes);                                         \
-    return ret;                                                            \
+#define DO__atomic_compare_exchange_simt_(bytes, bits)                    \
+  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0> \
+  bool __device__ __atomic_compare_exchange_simt_(                        \
+      type *ptr, type *expected, const type *desired, bool,               \
+      int success_memorder, int failure_memorder) {                       \
+    int##bits##_t tmp = 0, old = 0, old_tmp;                              \
+    memcpy(&tmp, desired, bytes);                                         \
+    memcpy(&old, expected, bytes);                                        \
+    old_tmp = old;                                                        \
+    switch (__stronger_order_simt_(success_memorder, failure_memorder)) { \
+      case __ATOMIC_SEQ_CST: __simt_fence_sc_();                          \
+      case __ATOMIC_CONSUME:                                              \
+      case __ATOMIC_ACQUIRE:                                              \
+        __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp);                \
+        break;                                                            \
+      case __ATOMIC_ACQ_REL:                                              \
+        __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp);                \
+        break;                                                            \
+      case __ATOMIC_RELEASE:                                              \
+        __simt_cas_release_##bits(ptr, old, old_tmp, tmp);                \
+        break;                                                            \
+      case __ATOMIC_RELAXED:                                              \
+        __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp);                \
+        break;                                                            \
+      default: assert(0);                                                 \
+    }                                                                     \
+    bool const ret = old == old_tmp;                                      \
+    memcpy(expected, &old, bytes);                                        \
+    return ret;                                                           \
   }
 DO__atomic_compare_exchange_simt_(4, 32)
     DO__atomic_compare_exchange_simt_(8, 64)
 
-        template <class type,
-                  typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
+        template <class type, std::enable_if_t<sizeof(type) <= 2, int> = 0>
         bool __device__
     __atomic_compare_exchange_simt_(type *ptr, type *expected,
                                     const type *desired, bool,
                                     int success_memorder,
                                     int failure_memorder) {
-  using R            = typename std::conditional<std::is_volatile<type>::value,
-                                      volatile uint32_t, uint32_t>::type;
+  using R = std::conditional_t<std::is_volatile<type>::value, volatile uint32_t,
+                               uint32_t>;
   auto const aligned = (R *)((intptr_t)ptr & ~(sizeof(uint32_t) - 1));
   auto const offset  = uint32_t((intptr_t)ptr & (sizeof(uint32_t) - 1)) * 8;
   auto const mask    = ((1 << sizeof(type) * 8) - 1) << offset;
@@ -581,8 +577,7 @@ bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected,
 }
 
 #define DO__atomic_exchange_simt_(bytes, bits)                                 \
-  template <class type,                                                        \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0>     \
+  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>      \
   void __device__ __atomic_exchange_simt_(type *ptr, type *val, type *ret,     \
                                           int memorder) {                      \
     int##bits##_t tmp = 0;                                                     \
@@ -600,8 +595,7 @@ bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected,
   }
 DO__atomic_exchange_simt_(4, 32) DO__atomic_exchange_simt_(8, 64)
 
-    template <class type,
-              typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
+    template <class type, std::enable_if_t<sizeof(type) <= 2, int> = 0>
     void __device__
     __atomic_exchange_simt_(type *ptr, type *val, type *ret, int memorder) {
   type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
@@ -620,7 +614,7 @@ type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) {
 
 #define DO__atomic_fetch_add_simt_(bytes, bits)                               \
   template <class type, class delta,                                          \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0>    \
+            std::enable_if_t<sizeof(type) == bytes, int> = 0>                 \
   type __device__ __atomic_fetch_add_simt_(type *ptr, delta val,              \
                                            int memorder) {                    \
     type ret;                                                                 \
@@ -638,7 +632,7 @@ type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) {
 DO__atomic_fetch_add_simt_(4, 32) DO__atomic_fetch_add_simt_(8, 64)
 
     template <class type, class delta,
-              typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
+              std::enable_if_t<sizeof(type) <= 2, int> = 0>
     type __device__
     __atomic_fetch_add_simt_(type *ptr, delta val, int memorder) {
   type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
@@ -651,7 +645,7 @@ DO__atomic_fetch_add_simt_(4, 32) DO__atomic_fetch_add_simt_(8, 64)
 
 #define DO__atomic_fetch_sub_simt_(bytes, bits)                                \
   template <class type, class delta,                                           \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0>     \
+            std::enable_if_t<sizeof(type) == bytes, int> = 0>                  \
   type __device__ __atomic_fetch_sub_simt_(type *ptr, delta val,               \
                                            int memorder) {                     \
     type ret;                                                                  \
@@ -669,7 +663,7 @@ DO__atomic_fetch_add_simt_(4, 32) DO__atomic_fetch_add_simt_(8, 64)
 DO__atomic_fetch_sub_simt_(4, 32) DO__atomic_fetch_sub_simt_(8, 64)
 
     template <class type, class delta,
-              typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
+              std::enable_if_t<sizeof(type) <= 2, int> = 0>
     type __device__
     __atomic_fetch_sub_simt_(type *ptr, delta val, int memorder) {
   type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
@@ -681,8 +675,7 @@ DO__atomic_fetch_sub_simt_(4, 32) DO__atomic_fetch_sub_simt_(8, 64)
 }
 
 #define DO__atomic_fetch_and_simt_(bytes, bits)                               \
-  template <class type,                                                       \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0>    \
+  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>     \
   type __device__ __atomic_fetch_and_simt_(type *ptr, type val,               \
                                            int memorder) {                    \
     type ret;                                                                 \
@@ -700,7 +693,7 @@ DO__atomic_fetch_sub_simt_(4, 32) DO__atomic_fetch_sub_simt_(8, 64)
 DO__atomic_fetch_and_simt_(4, 32) DO__atomic_fetch_and_simt_(8, 64)
 
     template <class type, class delta,
-              typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
+              std::enable_if_t<sizeof(type) <= 2, int> = 0>
     type __device__
     __atomic_fetch_and_simt_(type *ptr, delta val, int memorder) {
   type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
@@ -712,8 +705,7 @@ DO__atomic_fetch_and_simt_(4, 32) DO__atomic_fetch_and_simt_(8, 64)
 }
 
 #define DO__atomic_fetch_xor_simt_(bytes, bits)                               \
-  template <class type,                                                       \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0>    \
+  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>     \
   type __device__ __atomic_fetch_xor_simt_(type *ptr, type val,               \
                                            int memorder) {                    \
     type ret;                                                                 \
@@ -731,7 +723,7 @@ DO__atomic_fetch_and_simt_(4, 32) DO__atomic_fetch_and_simt_(8, 64)
 DO__atomic_fetch_xor_simt_(4, 32) DO__atomic_fetch_xor_simt_(8, 64)
 
     template <class type, class delta,
-              typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
+              std::enable_if_t<sizeof(type) <= 2, int> = 0>
     type __device__
     __atomic_fetch_xor_simt_(type *ptr, delta val, int memorder) {
   type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
@@ -743,8 +735,7 @@ DO__atomic_fetch_xor_simt_(4, 32) DO__atomic_fetch_xor_simt_(8, 64)
 }
 
 #define DO__atomic_fetch_or_simt_(bytes, bits)                                 \
-  template <class type,                                                        \
-            typename std::enable_if<sizeof(type) == bytes, int>::type = 0>     \
+  template <class type, std::enable_if_t<sizeof(type) == bytes, int> = 0>      \
   type __device__ __atomic_fetch_or_simt_(type *ptr, type val, int memorder) { \
     type ret;                                                                  \
     switch (memorder) {                                                        \
@@ -761,7 +752,7 @@ DO__atomic_fetch_xor_simt_(4, 32) DO__atomic_fetch_xor_simt_(8, 64)
 DO__atomic_fetch_or_simt_(4, 32) DO__atomic_fetch_or_simt_(8, 64)
 
     template <class type, class delta,
-              typename std::enable_if<sizeof(type) <= 2, int>::type = 0>
+              std::enable_if_t<sizeof(type) <= 2, int> = 0>
     type __device__
     __atomic_fetch_or_simt_(type *ptr, delta val, int memorder) {
   type expected      = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED);
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
index 36df0d256..e28e964d3 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@@ -63,14 +63,37 @@ void cuda_device_synchronize(const std::string& name);
 void cuda_stream_synchronize(const cudaStream_t stream,
                              const std::string& name);
 
-void cuda_internal_error_throw(cudaError e, const char* name,
-                               const char* file = nullptr, const int line = 0);
+[[noreturn]] void cuda_internal_error_throw(cudaError e, const char* name,
+                                            const char* file = nullptr,
+                                            const int line   = 0);
+
+#ifndef KOKKOS_COMPILER_NVHPC
+[[noreturn]]
+#endif
+             void cuda_internal_error_abort(cudaError e, const char* name,
+                                            const char* file = nullptr,
+                                            const int line   = 0);
 
 inline void cuda_internal_safe_call(cudaError e, const char* name,
                                     const char* file = nullptr,
                                     const int line   = 0) {
-  if (cudaSuccess != e) {
-    cuda_internal_error_throw(e, name, file, line);
+  // 1. Success -> normal continuation.
+  // 2. Error codes for which, to continue using CUDA, the process must be
+  //    terminated and relaunched -> call abort on the host-side.
+  // 3. Any other error code -> throw a runtime error.
+  switch (e) {
+    case cudaSuccess: break;
+    case cudaErrorIllegalAddress:
+    case cudaErrorAssert:
+    case cudaErrorHardwareStackError:
+    case cudaErrorIllegalInstruction:
+    case cudaErrorMisalignedAddress:
+    case cudaErrorInvalidAddressSpace:
+    case cudaErrorInvalidPc:
+    case cudaErrorLaunchFailure:
+      cuda_internal_error_abort(e, name, file, line);
+      break;
+    default: cuda_internal_error_throw(e, name, file, line); break;
   }
 }
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index e8a764134..40a263561 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -80,56 +80,38 @@ half_t cast_to_half(double val) {
 
 KOKKOS_INLINE_FUNCTION
 half_t cast_to_half(short val) {
-#ifdef __CUDA_ARCH__
-  return half_t(__short2half_rn(val));
-#else
-  return half_t(__float2half(static_cast<float>(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return half_t(__short2half_rn(val));))
+  KOKKOS_IF_ON_HOST((return half_t(__float2half(static_cast<float>(val)));))
 }
 
 KOKKOS_INLINE_FUNCTION
 half_t cast_to_half(unsigned short val) {
-#ifdef __CUDA_ARCH__
-  return half_t(__ushort2half_rn(val));
-#else
-  return half_t(__float2half(static_cast<float>(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return half_t(__ushort2half_rn(val));))
+  KOKKOS_IF_ON_HOST((return half_t(__float2half(static_cast<float>(val)));))
 }
 
 KOKKOS_INLINE_FUNCTION
 half_t cast_to_half(int val) {
-#ifdef __CUDA_ARCH__
-  return half_t(__int2half_rn(val));
-#else
-  return half_t(__float2half(static_cast<float>(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return half_t(__int2half_rn(val));))
+  KOKKOS_IF_ON_HOST((return half_t(__float2half(static_cast<float>(val)));))
 }
 
 KOKKOS_INLINE_FUNCTION
 half_t cast_to_half(unsigned int val) {
-#ifdef __CUDA_ARCH__
-  return half_t(__uint2half_rn(val));
-#else
-  return half_t(__float2half(static_cast<float>(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return half_t(__uint2half_rn(val));))
+  KOKKOS_IF_ON_HOST((return half_t(__float2half(static_cast<float>(val)));))
 }
 
 KOKKOS_INLINE_FUNCTION
 half_t cast_to_half(long long val) {
-#ifdef __CUDA_ARCH__
-  return half_t(__ll2half_rn(val));
-#else
-  return half_t(__float2half(static_cast<float>(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return half_t(__ll2half_rn(val));))
+  KOKKOS_IF_ON_HOST((return half_t(__float2half(static_cast<float>(val)));))
 }
 
 KOKKOS_INLINE_FUNCTION
 half_t cast_to_half(unsigned long long val) {
-#ifdef __CUDA_ARCH__
-  return half_t(__ull2half_rn(val));
-#else
-  return half_t(__float2half(static_cast<float>(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return half_t(__ull2half_rn(val));))
+  KOKKOS_IF_ON_HOST((return half_t(__float2half(static_cast<float>(val)));))
 }
 
 KOKKOS_INLINE_FUNCTION
@@ -163,62 +145,50 @@ cast_from_half(half_t val) {
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T>
 cast_from_half(half_t val) {
-#ifdef __CUDA_ARCH__
-  return __half2short_rz(half_t::impl_type(val));
-#else
-  return static_cast<T>(__half2float(half_t::impl_type(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return __half2short_rz(half_t::impl_type(val));))
+  KOKKOS_IF_ON_HOST(
+      (return static_cast<T>(__half2float(half_t::impl_type(val)));))
 }
 
 template <class T>
 KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned short>::value, T>
     cast_from_half(half_t val) {
-#ifdef __CUDA_ARCH__
-  return __half2ushort_rz(half_t::impl_type(val));
-#else
-  return static_cast<T>(__half2float(half_t::impl_type(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return __half2ushort_rz(half_t::impl_type(val));))
+  KOKKOS_IF_ON_HOST(
+      (return static_cast<T>(__half2float(half_t::impl_type(val)));))
 }
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T>
 cast_from_half(half_t val) {
-#ifdef __CUDA_ARCH__
-  return __half2int_rz(half_t::impl_type(val));
-#else
-  return static_cast<T>(__half2float(half_t::impl_type(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return __half2int_rz(half_t::impl_type(val));))
+  KOKKOS_IF_ON_HOST(
+      (return static_cast<T>(__half2float(half_t::impl_type(val)));))
 }
 
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned>::value, T>
 cast_from_half(half_t val) {
-#ifdef __CUDA_ARCH__
-  return __half2uint_rz(half_t::impl_type(val));
-#else
-  return static_cast<T>(__half2float(half_t::impl_type(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return __half2uint_rz(half_t::impl_type(val));))
+  KOKKOS_IF_ON_HOST(
+      (return static_cast<T>(__half2float(half_t::impl_type(val)));))
 }
 
 template <class T>
 KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T>
 cast_from_half(half_t val) {
-#ifdef __CUDA_ARCH__
-  return __half2ll_rz(half_t::impl_type(val));
-#else
-  return static_cast<T>(__half2float(half_t::impl_type(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return __half2ll_rz(half_t::impl_type(val));))
+  KOKKOS_IF_ON_HOST(
+      (return static_cast<T>(__half2float(half_t::impl_type(val)));))
 }
 
 template <class T>
 KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
     cast_from_half(half_t val) {
-#ifdef __CUDA_ARCH__
-  return __half2ull_rz(half_t::impl_type(val));
-#else
-  return static_cast<T>(__half2float(half_t::impl_type(val)));
-#endif
+  KOKKOS_IF_ON_DEVICE((return __half2ull_rz(half_t::impl_type(val));))
+  KOKKOS_IF_ON_HOST(
+      (return static_cast<T>(__half2float(half_t::impl_type(val)));))
 }
 
 template <class T>
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index aaa9ea8ad..5811498e0 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -45,6 +45,10 @@
 /*--------------------------------------------------------------------------*/
 /* Kokkos interfaces */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_CUDA
 
@@ -57,6 +61,8 @@
 #include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
 
 /*--------------------------------------------------------------------------*/
 /* Standard 'C' libraries */
@@ -107,11 +113,15 @@ namespace Impl {
 namespace {
 
 __global__ void query_cuda_kernel_arch(int *d_arch) {
+#ifdef _NVHPC_CUDA
+  *d_arch = __builtin_current_device_sm() * 10;
+#else
 #if defined(__CUDA_ARCH__)
   *d_arch = __CUDA_ARCH__;
 #else
   *d_arch = 0;
 #endif
+#endif
 }
 
 /** Query what compute capability is actually launched to the device: */
@@ -184,6 +194,17 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
   throw_runtime_exception(out.str());
 }
 
+void cuda_internal_error_abort(cudaError e, const char *name, const char *file,
+                               const int line) {
+  std::ostringstream out;
+  out << name << " error( " << cudaGetErrorName(e)
+      << "): " << cudaGetErrorString(e);
+  if (file) {
+    out << " " << file << ":" << line;
+  }
+  abort(out.str().c_str());
+}
+
 //----------------------------------------------------------------------------
 // Some significant cuda device properties:
 //
@@ -331,8 +352,9 @@ CudaInternal::~CudaInternal() {
 
 int CudaInternal::verify_is_initialized(const char *const label) const {
   if (m_cudaDev < 0) {
-    std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized"
-              << std::endl;
+    Kokkos::abort((std::string("Kokkos::Cuda::") + label +
+                   " : ERROR device not initialized\n")
+                      .c_str());
   }
   return 0 <= m_cudaDev;
 }
@@ -716,13 +738,22 @@ void CudaInternal::finalize() {
   if (was_finalized) return;
 
   was_finalized = true;
-  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
-    // Only finalize this if we're the singleton
-    if (this == &singleton()) {
-      (void)Impl::cuda_global_unique_token_locks(true);
-      Impl::finalize_host_cuda_lock_arrays();
-    }
 
+  // Only finalize this if we're the singleton
+  if (this == &singleton()) {
+    (void)Impl::cuda_global_unique_token_locks(true);
+    Impl::finalize_host_cuda_lock_arrays();
+
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable));
+    auto &deep_copy_space =
+        Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
+    if (deep_copy_space)
+      deep_copy_space->impl_internal_space_instance()->finalize();
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream()));
+  }
+
+  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
     using RecordCuda = Kokkos::Impl::SharedAllocationRecord<CudaSpace>;
     using RecordHost =
         Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>;
@@ -732,47 +763,36 @@ void CudaInternal::finalize() {
     RecordHost::decrement(RecordHost::get_record(m_scratchUnified));
     if (m_scratchFunctorSize > 0)
       RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));
+  }
 
-    for (int i = 0; i < m_n_team_scratch; ++i) {
-      if (m_team_scratch_current_size[i] > 0)
-        Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]);
-    }
-
-    if (m_manage_stream && m_stream != nullptr)
-      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream));
-
-    m_cudaDev             = -1;
-    m_multiProcCount      = 0;
-    m_maxWarpCount        = 0;
-    m_maxBlock            = {0, 0, 0};
-    m_maxSharedWords      = 0;
-    m_scratchSpaceCount   = 0;
-    m_scratchFlagsCount   = 0;
-    m_scratchUnifiedCount = 0;
-    m_streamCount         = 0;
-    m_scratchSpace        = nullptr;
-    m_scratchFlags        = nullptr;
-    m_scratchUnified      = nullptr;
-    m_stream              = nullptr;
-    for (int i = 0; i < m_n_team_scratch; ++i) {
-      m_team_scratch_current_size[i] = 0;
-      m_team_scratch_ptr[i]          = nullptr;
-    }
-
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(m_scratch_locks));
-    m_scratch_locks = nullptr;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    if (m_team_scratch_current_size[i] > 0)
+      Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]);
   }
 
-  // only destroy these if we're finalizing the singleton
-  if (this == &singleton()) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging));
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable));
-    auto &deep_copy_space =
-        Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
-    if (deep_copy_space)
-      deep_copy_space->impl_internal_space_instance()->finalize();
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream()));
+  if (m_manage_stream && m_stream != nullptr)
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream));
+
+  m_cudaDev             = -1;
+  m_multiProcCount      = 0;
+  m_maxWarpCount        = 0;
+  m_maxBlock            = {0, 0, 0};
+  m_maxSharedWords      = 0;
+  m_scratchSpaceCount   = 0;
+  m_scratchFlagsCount   = 0;
+  m_scratchUnifiedCount = 0;
+  m_streamCount         = 0;
+  m_scratchSpace        = nullptr;
+  m_scratchFlags        = nullptr;
+  m_scratchUnified      = nullptr;
+  m_stream              = nullptr;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
   }
+
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(m_scratch_locks));
+  m_scratch_locks = nullptr;
 }
 
 //----------------------------------------------------------------------------
@@ -839,9 +859,16 @@ int Cuda::impl_is_initialized() {
   return Impl::CudaInternal::singleton().is_initialized();
 }
 
-void Cuda::impl_initialize(const Cuda::SelectDevice config,
-                           size_t /*num_instances*/) {
-  Impl::CudaInternal::singleton().initialize(config.cuda_device_id, nullptr);
+void Cuda::impl_initialize(InitializationSettings const &settings) {
+  Impl::CudaInternal::singleton().initialize(Impl::get_gpu(settings));
+
+  // In order to support setting an atexit hook for Kokkos::finalize
+  // We need to ensure that the Cuda deep_copy instance is not destroyed
+  // before that atexit hook is getting called.
+  // Thus we create the static instance here, so that it will be deallocated
+  // after the potential atexit call.
+  // This is neccessary since we will access that instance in Kokkos::finalize
+  (void)::Kokkos::Impl::cuda_get_deep_copy_space(true);
 }
 
 std::vector<unsigned> Cuda::detect_device_arch() {
@@ -891,20 +918,59 @@ Cuda::Cuda(cudaStream_t stream, bool manage_stream)
                                stream, manage_stream);
 }
 
-void Cuda::print_configuration(std::ostream &s, const bool) {
-  Impl::CudaInternal::singleton().print_configuration(s);
+void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
+  os << "Device Execution Space:\n";
+  os << "  KOKKOS_ENABLE_CUDA: yes\n";
+
+  os << "Cuda Atomics:\n";
+  os << "  KOKKOS_ENABLE_CUDA_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+
+  os << "Cuda Options:\n";
+  os << "  KOKKOS_ENABLE_CUDA_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+  os << "  KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
+#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+  os << "  KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+  os << "  KOKKOS_ENABLE_CUDA_UVM: ";
+#ifdef KOKKOS_ENABLE_CUDA_UVM
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+  os << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+
+  os << "\nCuda Runtime Configuration:\n";
+
+  m_space_instance->print_configuration(os);
 }
 
 void Cuda::impl_static_fence(const std::string &name) {
   Kokkos::Impl::cuda_device_synchronize(name);
 }
-void Cuda::impl_static_fence() {
-  impl_static_fence("Kokkos::Cuda::impl_static_fence(): Unnamed Static Fence");
-}
 
-void Cuda::fence() const {
-  fence("Kokkos::Cuda::fence(): Unnamed Instance Fence");
-}
 void Cuda::fence(const std::string &name) const {
   m_space_instance->fence(name);
 }
@@ -922,89 +988,8 @@ const cudaDeviceProp &Cuda::cuda_device_prop() const {
 
 namespace Impl {
 
-int get_gpu(const InitArguments &args);
-
 int g_cuda_space_factory_initialized =
-    initialize_space_factory<CudaSpaceInitializer>("150_Cuda");
-
-void CudaSpaceInitializer::initialize(const InitArguments &args) {
-  int use_gpu = get_gpu(args);
-  if (std::is_same<Kokkos::Cuda, Kokkos::DefaultExecutionSpace>::value ||
-      0 < use_gpu) {
-    if (use_gpu > -1) {
-      Kokkos::Cuda::impl_initialize(Kokkos::Cuda::SelectDevice(use_gpu));
-    } else {
-      Kokkos::Cuda::impl_initialize();
-    }
-  }
-}
-
-void CudaSpaceInitializer::finalize(bool all_spaces) {
-  if ((std::is_same<Kokkos::Cuda, Kokkos::DefaultExecutionSpace>::value ||
-       all_spaces) &&
-      Kokkos::Cuda::impl_is_initialized()) {
-    Kokkos::Cuda::impl_finalize();
-  }
-}
-
-void CudaSpaceInitializer::fence() {
-  Kokkos::Cuda::impl_static_fence(
-      "Kokkos::CudaSpaceInitializer::fence: Initializer Fence");
-}
-void CudaSpaceInitializer::fence(const std::string &name) {
-  // Kokkos::Cuda::impl_static_fence("Kokkos::CudaSpaceInitializer::fence:
-  // "+name); //TODO: or this
-  Kokkos::Cuda::impl_static_fence(name);
-}
-
-void CudaSpaceInitializer::print_configuration(std::ostream &msg,
-                                               const bool detail) {
-  msg << "Device Execution Space:\n";
-  msg << "  KOKKOS_ENABLE_CUDA: yes\n";
-
-  msg << "Cuda Atomics:\n";
-  msg << "  KOKKOS_ENABLE_CUDA_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
-  msg << "yes\n";
-#else
-  msg << "no\n";
-#endif
-
-  msg << "Cuda Options:\n";
-  msg << "  KOKKOS_ENABLE_CUDA_LAMBDA: ";
-#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
-  msg << "yes\n";
-#else
-  msg << "no\n";
-#endif
-  msg << "  KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
-#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
-  msg << "yes\n";
-#else
-  msg << "no\n";
-#endif
-  msg << "  KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  msg << "yes\n";
-#else
-  msg << "no\n";
-#endif
-  msg << "  KOKKOS_ENABLE_CUDA_UVM: ";
-#ifdef KOKKOS_ENABLE_CUDA_UVM
-  msg << "yes\n";
-#else
-  msg << "no\n";
-#endif
-  msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
-#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-  msg << "yes\n";
-#else
-  msg << "no\n";
-#endif
-
-  msg << "\nCuda Runtime Configuration:" << std::endl;
-  Cuda::print_configuration(msg, detail);
-}
+    initialize_space_factory<Cuda>("150_Cuda");
 
 }  // namespace Impl
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84..88810b6fc 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -636,7 +636,7 @@ struct CudaParallelLaunchImpl<
           DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
           base_t::get_kernel_func(), prefer_shmem);
 
-      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+      ensure_cuda_lock_arrays_on_device();
 
       // Invoke the driver function on the device
       base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
index 1dcbdf039..379653481 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
 #ifdef KOKKOS_ENABLE_CUDA
 #include <Cuda/Kokkos_Cuda_Locks.hpp>
@@ -75,8 +79,7 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
 void initialize_host_cuda_lock_arrays() {
 #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
   desul::Impl::init_lock_arrays();
-
-  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+  desul::ensure_cuda_lock_arrays_on_device();
 #endif
   if (g_host_cuda_lock_arrays.atomic != nullptr) return;
   KOKKOS_IMPL_CUDA_SAFE_CALL(
@@ -85,7 +88,7 @@ void initialize_host_cuda_lock_arrays() {
   Impl::cuda_device_synchronize(
       "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
   g_host_cuda_lock_arrays.n = Cuda::concurrency();
-  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  copy_cuda_lock_arrays_to_device();
   init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
                                   256>>>();
   Impl::cuda_device_synchronize(
@@ -102,7 +105,7 @@ void finalize_host_cuda_lock_arrays() {
   g_host_cuda_lock_arrays.atomic = nullptr;
   g_host_cuda_lock_arrays.n      = 0;
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  copy_cuda_lock_arrays_to_device();
 #endif
 }
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
index bdb772398..244f142f0 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -67,7 +67,7 @@ struct CudaLockArrays {
 
 /// \brief This global variable in Host space is the central definition
 ///        of these arrays.
-extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
+extern CudaLockArrays g_host_cuda_lock_arrays;
 
 /// \brief After this call, the g_host_cuda_lock_arrays variable has
 ///        valid, initialized arrays.
@@ -105,12 +105,12 @@ namespace Impl {
 /// instances in other translation units, we must update this CUDA global
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
-/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
 __device__
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
     __constant__ extern
 #endif
-    Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
+    CudaLockArrays g_device_cuda_lock_arrays;
 
 #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
 
@@ -123,9 +123,7 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset        = offset >> 2;
   offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  return (
-      0 ==
-      atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
+  return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
 }
 
 /// \brief Release lock for the address
@@ -138,7 +136,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset        = offset >> 2;
   offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
+  atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
 }
 
 }  // namespace Impl
@@ -151,45 +149,49 @@ namespace {
 static int lock_array_copied = 0;
 inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
-}  // namespace Impl
-}  // namespace Kokkos
 
-/* Dan Ibanez: it is critical that this code be a macro, so that it will
-   capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
-   putting this in an inline function will NOT do the right thing! */
-#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                      \
-  {                                                                   \
-    if (::Kokkos::Impl::lock_array_copied == 0) {                     \
-      KOKKOS_IMPL_CUDA_SAFE_CALL(                                     \
-          cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
-                             &Kokkos::Impl::g_host_cuda_lock_arrays,  \
-                             sizeof(Kokkos::Impl::CudaLockArrays)));  \
-    }                                                                 \
-    lock_array_copied = 1;                                            \
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+inline
+#else
+static
+#endif
+    void
+    copy_cuda_lock_arrays_to_device() {
+  if (lock_array_copied == 0) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
+                                                  &g_host_cuda_lock_arrays,
+                                                  sizeof(CudaLockArrays)));
   }
+  lock_array_copied = 1;
+}
 
 #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+inline void ensure_cuda_lock_arrays_on_device() {}
 #else
-#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
-  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+inline static void ensure_cuda_lock_arrays_on_device() {
+  copy_cuda_lock_arrays_to_device();
+}
 #endif
 
 #else
 
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+inline void ensure_cuda_lock_arrays_on_device() {}
 #else
 // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
-#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
-  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()         \
-  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+inline static void ensure_cuda_lock_arrays_on_device() {
+  copy_cuda_lock_arrays_to_device();
+  desul::ensure_cuda_lock_arrays_on_device();
+}
 #endif
 
 #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
 
+}  // namespace Impl
+}  // namespace Kokkos
+
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 
 #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
deleted file mode 100644
index 5016f73e3..000000000
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ /dev/null
@@ -1,2722 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDA_PARALLEL_HPP
-#define KOKKOS_CUDA_PARALLEL_HPP
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_CUDA)
-
-#include <algorithm>
-#include <string>
-#include <cstdio>
-#include <cstdint>
-
-#include <utility>
-#include <Kokkos_Parallel.hpp>
-
-#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
-#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
-#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
-#include <Cuda/Kokkos_Cuda_Locks.hpp>
-#include <Cuda/Kokkos_Cuda_Team.hpp>
-#include <Kokkos_MinMaxClamp.hpp>
-#include <Kokkos_Vectorization.hpp>
-
-#include <impl/Kokkos_Tools.hpp>
-#include <typeinfo>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-#include <impl/KokkosExp_IterateTileGPU.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-extern bool show_warnings() noexcept;
-
-namespace Impl {
-
-template <class... Properties>
-class TeamPolicyInternal<Kokkos::Cuda, Properties...>
-    : public PolicyTraits<Properties...> {
- public:
-  //! Tag this class as a kokkos execution policy
-  using execution_policy = TeamPolicyInternal;
-
-  using traits = PolicyTraits<Properties...>;
-
-  template <class ExecSpace, class... OtherProperties>
-  friend class TeamPolicyInternal;
-
- private:
-  enum { MAX_WARP = 8 };
-
-  typename traits::execution_space m_space;
-  int m_league_size;
-  int m_team_size;
-  int m_vector_length;
-  int m_team_scratch_size[2];
-  int m_thread_scratch_size[2];
-  int m_chunk_size;
-  bool m_tune_team;
-  bool m_tune_vector;
-
- public:
-  //! Execution space of this execution policy
-  using execution_space = Kokkos::Cuda;
-
-  template <class... OtherProperties>
-  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) {
-    m_league_size            = p.m_league_size;
-    m_team_size              = p.m_team_size;
-    m_vector_length          = p.m_vector_length;
-    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
-    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
-    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
-    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
-    m_chunk_size             = p.m_chunk_size;
-    m_space                  = p.m_space;
-    m_tune_team              = p.m_tune_team;
-    m_tune_vector            = p.m_tune_vector;
-  }
-
-  //----------------------------------------
-
-  template <class FunctorType>
-  int team_size_max(const FunctorType& f, const ParallelForTag&) const {
-    using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
-            get_cuda_func_attributes();
-    int block_size =
-        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
-                                              typename traits::launch_bounds>(
-            space().impl_internal_space_instance(), attr, f,
-            (size_t)impl_vector_length(),
-            (size_t)team_scratch_size(0) + 2 * sizeof(double),
-            (size_t)thread_scratch_size(0) + sizeof(double));
-    return block_size / impl_vector_length();
-  }
-
-  template <class FunctorType>
-  inline int team_size_max(const FunctorType& f,
-                           const ParallelReduceTag&) const {
-    using functor_analysis_type =
-        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                              TeamPolicyInternal, FunctorType>;
-    using reducer_type = typename Impl::ParallelReduceReturnValue<
-        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type;
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             reducer_type>;
-    return internal_team_size_max<closure_type>(f);
-  }
-
-  template <class FunctorType, class ReducerType>
-  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
-                           const ParallelReduceTag&) const {
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             ReducerType>;
-    return internal_team_size_max<closure_type>(f);
-  }
-
-  template <class FunctorType>
-  int team_size_recommended(const FunctorType& f, const ParallelForTag&) const {
-    using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
-            get_cuda_func_attributes();
-    const int block_size =
-        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
-                                              typename traits::launch_bounds>(
-            space().impl_internal_space_instance(), attr, f,
-            (size_t)impl_vector_length(),
-            (size_t)team_scratch_size(0) + 2 * sizeof(double),
-            (size_t)thread_scratch_size(0) + sizeof(double));
-    return block_size / impl_vector_length();
-  }
-
-  template <class FunctorType>
-  inline int team_size_recommended(const FunctorType& f,
-                                   const ParallelReduceTag&) const {
-    using functor_analysis_type =
-        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                              TeamPolicyInternal, FunctorType>;
-    using reducer_type = typename Impl::ParallelReduceReturnValue<
-        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type;
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             reducer_type>;
-    return internal_team_size_recommended<closure_type>(f);
-  }
-
-  template <class FunctorType, class ReducerType>
-  int team_size_recommended(const FunctorType& f, const ReducerType&,
-                            const ParallelReduceTag&) const {
-    using closure_type =
-        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                             ReducerType>;
-    return internal_team_size_recommended<closure_type>(f);
-  }
-
-  inline static int vector_length_max() { return Impl::CudaTraits::WarpSize; }
-
-  inline static int verify_requested_vector_length(
-      int requested_vector_length) {
-    int test_vector_length =
-        std::min(requested_vector_length, vector_length_max());
-
-    // Allow only power-of-two vector_length
-    if (!(is_integral_power_of_two(test_vector_length))) {
-      int test_pow2 = 1;
-      for (int i = 0; i < 5; i++) {
-        test_pow2 = test_pow2 << 1;
-        if (test_pow2 > test_vector_length) {
-          break;
-        }
-      }
-      test_vector_length = test_pow2 >> 1;
-    }
-
-    return test_vector_length;
-  }
-
-  inline static int scratch_size_max(int level) {
-    return (
-        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
-                                  // for team_member.reduce etc.
-            20 * 1024 *
-                1024);  // arbitrarily setting this to 20MB, for a Volta V100
-                        // that would give us about 3.2GB for 2 teams per SM
-  }
-
-  //----------------------------------------
-
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-  KOKKOS_DEPRECATED inline int vector_length() const {
-    return impl_vector_length();
-  }
-#endif
-  inline int impl_vector_length() const { return m_vector_length; }
-  inline int team_size() const { return m_team_size; }
-  inline int league_size() const { return m_league_size; }
-  inline bool impl_auto_team_size() const { return m_tune_team; }
-  inline bool impl_auto_vector_length() const { return m_tune_vector; }
-  inline void impl_set_team_size(size_t team_size) { m_team_size = team_size; }
-  inline void impl_set_vector_length(size_t vector_length) {
-    m_vector_length = vector_length;
-  }
-  inline int scratch_size(int level, int team_size_ = -1) const {
-    if (team_size_ < 0) team_size_ = m_team_size;
-    return m_team_scratch_size[level] +
-           team_size_ * m_thread_scratch_size[level];
-  }
-  inline int team_scratch_size(int level) const {
-    return m_team_scratch_size[level];
-  }
-  inline int thread_scratch_size(int level) const {
-    return m_thread_scratch_size[level];
-  }
-
-  const typename traits::execution_space& space() const { return m_space; }
-
-  TeamPolicyInternal()
-      : m_space(typename traits::execution_space()),
-        m_league_size(0),
-        m_team_size(-1),
-        m_vector_length(0),
-        m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(Impl::CudaTraits::WarpSize),
-        m_tune_team(false),
-        m_tune_vector(false) {}
-
-  /** \brief  Specify league size, specify team size, specify vector length */
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     int team_size_request, int vector_length_request = 1)
-      : m_space(space_),
-        m_league_size(league_size_),
-        m_team_size(team_size_request),
-        m_vector_length(
-            (vector_length_request > 0)
-                ? verify_requested_vector_length(vector_length_request)
-                : verify_requested_vector_length(1)),
-        m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_chunk_size(Impl::CudaTraits::WarpSize),
-        m_tune_team(bool(team_size_request <= 0)),
-        m_tune_vector(bool(vector_length_request <= 0)) {
-    // Make sure league size is permissible
-    if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0]))
-      Impl::throw_runtime_exception(
-          "Requested too large league_size for TeamPolicy on Cuda execution "
-          "space.");
-
-    // Make sure total block size is permissible
-    if (m_team_size * m_vector_length >
-        int(Impl::CudaTraits::MaxHierarchicalParallelism)) {
-      Impl::throw_runtime_exception(
-          std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
-                      "Team size x vector length must be smaller than 1024."));
-    }
-  }
-
-  /** \brief  Specify league size, request team size, specify vector length */
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
-
-  /** \brief  Specify league size, request team size and vector length */
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     const Kokkos::AUTO_t& /* team_size_request */,
-                     const Kokkos::AUTO_t& /* vector_length_request */
-                     )
-      : TeamPolicyInternal(space_, league_size_, -1, -1) {}
-
-  /** \brief  Specify league size, specify team size, request vector length */
-  TeamPolicyInternal(const execution_space space_, int league_size_,
-                     int team_size_request, const Kokkos::AUTO_t&)
-      : TeamPolicyInternal(space_, league_size_, team_size_request, -1) {}
-
-  TeamPolicyInternal(int league_size_, int team_size_request,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
-                           team_size_request, vector_length_request) {}
-
-  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
-                           team_size_request, vector_length_request)
-
-  {}
-
-  /** \brief  Specify league size, request team size */
-  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
-                     const Kokkos::AUTO_t& vector_length_request)
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
-                           team_size_request, vector_length_request) {}
-
-  /** \brief  Specify league size, request team size */
-  TeamPolicyInternal(int league_size_, int team_size_request,
-                     const Kokkos::AUTO_t& vector_length_request)
-      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
-                           team_size_request, vector_length_request) {}
-
-  inline int chunk_size() const { return m_chunk_size; }
-
-  /** \brief set chunk_size to a discrete value*/
-  inline TeamPolicyInternal& set_chunk_size(
-      typename traits::index_type chunk_size_) {
-    m_chunk_size = chunk_size_;
-    return *this;
-  }
-
-  /** \brief set per team scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(const int& level,
-                                              const PerTeamValue& per_team) {
-    m_team_scratch_size[level] = per_team.value;
-    return *this;
-  }
-
-  /** \brief set per thread scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerThreadValue& per_thread) {
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  /** \brief set per thread and per team scratch size for a specific level of
-   * the scratch hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerTeamValue& per_team,
-      const PerThreadValue& per_thread) {
-    m_team_scratch_size[level]   = per_team.value;
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  using member_type = Kokkos::Impl::CudaTeamMember;
-
- protected:
-  template <class ClosureType, class FunctorType, class BlockSizeCallable>
-  int internal_team_size_common(const FunctorType& f,
-                                BlockSizeCallable&& block_size_callable) const {
-    using closure_type = ClosureType;
-    using functor_value_traits =
-        Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;
-
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
-            get_cuda_func_attributes();
-    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
-        space().impl_internal_space_instance(), attr, f,
-        (size_t)impl_vector_length(),
-        (size_t)team_scratch_size(0) + 2 * sizeof(double),
-        (size_t)thread_scratch_size(0) + sizeof(double) +
-            ((functor_value_traits::StaticValueSize != 0)
-                 ? 0
-                 : functor_value_traits::value_size(f)));
-    KOKKOS_ASSERT(block_size > 0);
-
-    // Currently we require Power-of-2 team size for reductions.
-    int p2 = 1;
-    while (p2 <= block_size) p2 *= 2;
-    p2 /= 2;
-    return p2 / impl_vector_length();
-  }
-
-  template <class ClosureType, class FunctorType>
-  int internal_team_size_max(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f,
-        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
-                                              typename traits::launch_bounds>);
-  }
-
-  template <class ClosureType, class FunctorType>
-  int internal_team_size_recommended(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f,
-        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
-                                              typename traits::launch_bounds>);
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
- public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
- private:
-  using Member       = typename Policy::member_type;
-  using WorkTag      = typename Policy::work_tag;
-  using LaunchBounds = typename Policy::launch_bounds;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  ParallelFor()        = delete;
-  ParallelFor& operator=(const ParallelFor&) = delete;
-
-  template <class TagType>
-  inline __device__
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const Member i) const {
-    m_functor(i);
-  }
-
-  template <class TagType>
-  inline __device__
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const Member i) const {
-    m_functor(TagType(), i);
-  }
-
- public:
-  using functor_type = FunctorType;
-
-  Policy const& get_policy() const { return m_policy; }
-
-  inline __device__ void operator()() const {
-    const Member work_stride = blockDim.y * gridDim.x;
-    const Member work_end    = m_policy.end();
-
-    for (Member iwork =
-             m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x;
-         iwork < work_end;
-         iwork = iwork < work_end - work_stride ? iwork + work_stride
-                                                : work_end) {
-      this->template exec_range<WorkTag>(iwork);
-    }
-  }
-
-  inline void execute() const {
-    const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
-
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor,
-                           LaunchBounds>::get_cuda_func_attributes();
-    const int block_size =
-        Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-            m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
-            0, 0);
-    KOKKOS_ASSERT(block_size > 0);
-    dim3 block(1, block_size, 1);
-    dim3 grid(
-        std::min(
-            typename Policy::index_type((nwork + block.y - 1) / block.y),
-            typename Policy::index_type(cuda_internal_maximum_grid_count()[0])),
-        1, 1);
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
-      block = dim3(1, 1, 1);
-      grid  = dim3(1, 1, 1);
-    }
-#endif
-
-    CudaParallelLaunch<ParallelFor, LaunchBounds>(
-        *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
-        false);
-  }
-
-  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-// MDRangePolicy impl
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
- public:
-  using Policy       = Kokkos::MDRangePolicy<Traits...>;
-  using functor_type = FunctorType;
-
- private:
-  using RP               = Policy;
-  using array_index_type = typename Policy::array_index_type;
-  using index_type       = typename Policy::index_type;
-  using LaunchBounds     = typename Policy::launch_bounds;
-
-  const FunctorType m_functor;
-  const Policy m_rp;
-
- public:
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor,
-                           LaunchBounds>::get_cuda_func_attributes();
-    auto const& prop = pol.space().cuda_device_prop();
-    // Limits due to registers/SM, MDRange doesn't have
-    // shared memory constraints
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
-  }
-  Policy const& get_policy() const { return m_rp; }
-  inline __device__ void operator()() const {
-    Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
-                                    typename Policy::work_tag>(m_rp, m_functor)
-        .exec_range();
-  }
-
-  inline void execute() const {
-    using namespace std;
-
-    if (m_rp.m_num_tiles == 0) return;
-    const auto maxblocks = cuda_internal_maximum_grid_count();
-    if (RP::rank == 2) {
-      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
-      KOKKOS_ASSERT(block.x > 0);
-      KOKKOS_ASSERT(block.y > 0);
-      const dim3 grid(
-          std::min<array_index_type>(
-              (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
-              maxblocks[0]),
-          std::min<array_index_type>(
-              (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
-              maxblocks[1]),
-          1);
-      CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
-    } else if (RP::rank == 3) {
-      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
-      KOKKOS_ASSERT(block.x > 0);
-      KOKKOS_ASSERT(block.y > 0);
-      KOKKOS_ASSERT(block.z > 0);
-      const dim3 grid(
-          std::min<array_index_type>(
-              (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
-              maxblocks[0]),
-          std::min<array_index_type>(
-              (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
-              maxblocks[1]),
-          std::min<array_index_type>(
-              (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
-              maxblocks[2]));
-      CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
-    } else if (RP::rank == 4) {
-      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
-      // threadIdx.z
-      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
-                       m_rp.m_tile[3]);
-      KOKKOS_ASSERT(block.y > 0);
-      KOKKOS_ASSERT(block.z > 0);
-      const dim3 grid(
-          std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1],
-                                     maxblocks[0]),
-          std::min<array_index_type>(
-              (m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
-              maxblocks[1]),
-          std::min<array_index_type>(
-              (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
-              maxblocks[2]));
-      CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
-    } else if (RP::rank == 5) {
-      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
-      // threadIdx.z
-      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
-                       m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
-      KOKKOS_ASSERT(block.z > 0);
-      const dim3 grid(
-          std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1],
-                                     maxblocks[0]),
-          std::min<array_index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3],
-                                     maxblocks[1]),
-          std::min<array_index_type>(
-              (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
-              maxblocks[2]));
-      CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
-    } else if (RP::rank == 6) {
-      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
-      // threadIdx.z
-      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
-                       m_rp.m_tile[2] * m_rp.m_tile[3],
-                       m_rp.m_tile[4] * m_rp.m_tile[5]);
-      const dim3 grid(
-          std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1],
-                                     maxblocks[0]),
-          std::min<array_index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3],
-                                     maxblocks[1]),
-          std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
-                                     maxblocks[2]));
-      CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
-    } else {
-      Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
-    }
-
-  }  // end execute
-
-  //  inline
-  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
-      : m_functor(arg_functor), m_rp(arg_policy) {}
-};
-
-__device__ inline int64_t cuda_get_scratch_index(Cuda::size_type league_size,
-                                                 int32_t* scratch_locks) {
-  int64_t threadid = 0;
-  __shared__ int64_t base_thread_id;
-  if (threadIdx.x == 0 && threadIdx.y == 0) {
-    int64_t const wraparound_len = Kokkos::Experimental::min(
-        int64_t(league_size),
-        (int64_t(Kokkos::Impl::g_device_cuda_lock_arrays.n)) /
-            (blockDim.x * blockDim.y));
-    threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
-    threadid *= blockDim.x * blockDim.y;
-    int done = 0;
-    while (!done) {
-      done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1));
-      if (!done) {
-        threadid += blockDim.x * blockDim.y;
-        if (int64_t(threadid + blockDim.x * blockDim.y) >=
-            wraparound_len * blockDim.x * blockDim.y)
-          threadid = 0;
-      }
-    }
-    base_thread_id = threadid;
-  }
-  __syncthreads();
-  threadid = base_thread_id;
-  return threadid;
-}
-
-__device__ inline void cuda_release_scratch_index(int32_t* scratch_locks,
-                                                  int64_t threadid) {
-  __syncthreads();
-  if (threadIdx.x == 0 && threadIdx.y == 0) {
-    scratch_locks[threadid] = 0;
-  }
-}
-
-template <class FunctorType, class... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::Cuda> {
- public:
-  using Policy = TeamPolicy<Properties...>;
-
- private:
-  using Member       = typename Policy::member_type;
-  using WorkTag      = typename Policy::work_tag;
-  using LaunchBounds = typename Policy::launch_bounds;
-
- public:
-  using functor_type = FunctorType;
-  using size_type    = Cuda::size_type;
-
- private:
-  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
-  // blockDim.z == 1 shared memory utilization:
-  //
-  //  [ team   reduce space ]
-  //  [ team   shared space ]
-  //
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const size_type m_league_size;
-  int m_team_size;
-  const size_type m_vector_size;
-  int m_shmem_begin;
-  int m_shmem_size;
-  void* m_scratch_ptr[2];
-  int m_scratch_size[2];
-  int m_scratch_pool_id = -1;
-  int32_t* m_scratch_locks;
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_team(const Member& member) const {
-    m_functor(member);
-  }
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_team(const Member& member) const {
-    m_functor(TagType(), member);
-  }
-
- public:
-  Policy const& get_policy() const { return m_policy; }
-
-  __device__ inline void operator()() const {
-    // Iterate this block through the league
-    int64_t threadid = 0;
-    if (m_scratch_size[1] > 0) {
-      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
-    }
-
-    const int int_league_size = (int)m_league_size;
-    for (int league_rank = blockIdx.x; league_rank < int_league_size;
-         league_rank += gridDim.x) {
-      this->template exec_team<WorkTag>(typename Policy::member_type(
-          kokkos_impl_cuda_shared_memory<void>(), m_shmem_begin, m_shmem_size,
-          (void*)(((char*)m_scratch_ptr[1]) +
-                  ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
-                      m_scratch_size[1]),
-          m_scratch_size[1], league_rank, m_league_size));
-    }
-    if (m_scratch_size[1] > 0) {
-      cuda_release_scratch_index(m_scratch_locks, threadid);
-    }
-  }
-
-  inline void execute() const {
-    const int64_t shmem_size_total = m_shmem_begin + m_shmem_size;
-    dim3 grid(int(m_league_size), 1, 1);
-    const dim3 block(int(m_vector_size), int(m_team_size), 1);
-
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
-      grid = dim3(1, 1, 1);
-    }
-#endif
-
-    CudaParallelLaunch<ParallelFor, LaunchBounds>(
-        *this, grid, block, shmem_size_total,
-        m_policy.space().impl_internal_space_instance(),
-        true);  // copy to device and execute
-  }
-
-  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor,
-                           LaunchBounds>::get_cuda_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
-
-    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
-    m_shmem_size =
-        (m_policy.scratch_size(0, m_team_size) +
-         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
-    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks =
-        m_policy.space().impl_internal_space_instance()->m_scratch_locks;
-
-    // Functor's reduce memory, team scan memory, and team shared memory depend
-    // upon team size.
-    m_scratch_ptr[0] = nullptr;
-    if (m_team_size <= 0) {
-      m_scratch_ptr[1] = nullptr;
-    } else {
-      auto scratch_ptr_id =
-          m_policy.space()
-              .impl_internal_space_instance()
-              ->resize_team_scratch_space(
-                  static_cast<std::int64_t>(m_scratch_size[1]) *
-                  (std::min(
-                      static_cast<std::int64_t>(Cuda::concurrency() /
-                                                (m_team_size * m_vector_size)),
-                      static_cast<std::int64_t>(m_league_size))));
-      m_scratch_ptr[1]  = scratch_ptr_id.first;
-      m_scratch_pool_id = scratch_ptr_id.second;
-    }
-
-    const int shmem_size_total = m_shmem_begin + m_shmem_size;
-    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-        shmem_size_total) {
-      printf(
-          "%i %i\n",
-          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
-          shmem_size_total);
-      Kokkos::Impl::throw_runtime_exception(std::string(
-          "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
-    }
-
-    if (int(m_team_size) >
-        int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
-                m_policy.space().impl_internal_space_instance(), attr,
-                arg_functor, arg_policy.impl_vector_length(),
-                arg_policy.team_scratch_size(0),
-                arg_policy.thread_scratch_size(0)) /
-            arg_policy.impl_vector_length())) {
-      Kokkos::Impl::throw_runtime_exception(std::string(
-          "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
-    }
-  }
-
-  ~ParallelFor() {
-    if (m_scratch_pool_id >= 0) {
-      m_policy.space()
-          .impl_internal_space_instance()
-          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
-    }
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::Cuda> {
- public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
- private:
-  using WorkRange    = typename Policy::WorkRange;
-  using WorkTag      = typename Policy::work_tag;
-  using Member       = typename Policy::member_type;
-  using LaunchBounds = typename Policy::launch_bounds;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
- public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using value_type     = typename ValueTraits::value_type;
-  using reference_type = typename ValueTraits::reference_type;
-  using functor_type   = FunctorType;
-  // Conditionally set word_size_type to int16_t or int8_t if value_type is
-  // smaller than int32_t (Kokkos::Cuda::size_type)
-  // word_size_type is used to determine the word count, shared memory buffer
-  // size, and global memory buffer size before the reduction is performed.
-  // Within the reduction, the word count is recomputed based on word_size_type
-  // and when calculating indexes into the shared/global memory buffers for
-  // performing the reduction, word_size_type is used again.
-  // For scalars > 4 bytes in size, indexing into shared/global memory relies
-  // on the block and grid dimensions to ensure that we index at the correct
-  // offset rather than at every 4 byte word; such that, when the join is
-  // performed, we have the correct data that was copied over in chunks of 4
-  // bytes.
-  using word_size_type = typename std::conditional<
-      sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
-      typename std::conditional<sizeof(value_type) == 2, int16_t, int8_t>::type,
-      Kokkos::Cuda::size_type>::type;
-  using index_type   = typename Policy::index_type;
-  using reducer_type = ReducerType;
-
-  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
-  // blockDim.z == 1
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const bool m_result_ptr_device_accessible;
-  const bool m_result_ptr_host_accessible;
-  word_size_type* m_scratch_space;
-  // m_scratch_flags must be of type Cuda::size_type due to use of atomics
-  // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
-  Cuda::size_type* m_scratch_flags;
-  word_size_type* m_unified_space;
-
-  // Shall we use the shfl based reduction or not (only use it for static sized
-  // types of more than 128bit)
-  enum {
-    UseShflReduction = false
-  };  //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize)
-      //};
-      // Some crutch to do function overloading
- private:
-  using DummyShflReductionType  = double;
-  using DummySHMEMReductionType = int;
-
- public:
-  Policy const& get_policy() const { return m_policy; }
-
-  // Make the exec_range calls call to Reduce::DeviceIterateTile
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update) const {
-    m_functor(i, update);
-  }
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update) const {
-    m_functor(TagType(), i, update);
-  }
-
-  __device__ inline void operator()() const {
-    /*    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
-      DummySHMEMReductionType>::select(1,1.0) );
-      }
-
-      __device__ inline
-      void run(const DummySHMEMReductionType& ) const
-      {*/
-    const integral_nonzero_constant<
-        word_size_type, ValueTraits::StaticValueSize / sizeof(word_size_type)>
-        word_count(ValueTraits::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(word_size_type));
-
-    {
-      reference_type value =
-          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          kokkos_impl_cuda_shared_memory<word_size_type>() +
-                              threadIdx.y * word_count.value);
-
-      // Number of blocks is bounded so that the reduction can be limited to two
-      // passes. Each thread block is given an approximately equal amount of
-      // work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
-      // equivalent.
-
-      const WorkRange range(m_policy, blockIdx.x, gridDim.x);
-
-      for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
-           iwork < iwork_end; iwork += blockDim.y) {
-        this->template exec_range<WorkTag>(iwork, value);
-      }
-    }
-
-    // Doing code duplication here to fix issue #3428
-    // Suspect optimizer bug??
-    // Reduce with final value at blockDim.y - 1 location.
-    // Shortcut for length zero reduction
-    if (m_policy.begin() == m_policy.end()) {
-      // This is the final block with the final result at the final threads'
-      // location
-
-      word_size_type* const shared =
-          kokkos_impl_cuda_shared_memory<word_size_type>() +
-          (blockDim.y - 1) * word_count.value;
-      word_size_type* const global =
-          m_result_ptr_device_accessible
-              ? reinterpret_cast<word_size_type*>(m_result_ptr)
-              : (m_unified_space ? m_unified_space : m_scratch_space);
-
-      if (threadIdx.y == 0) {
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), shared);
-      }
-
-      if (CudaTraits::WarpSize < word_count.value) {
-        __syncthreads();
-      }
-
-      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
-        global[i] = shared[i];
-      }
-      // return ;
-    }
-
-    if (m_policy.begin() != m_policy.end()) {
-      {
-        if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd,
-                                                WorkTagFwd>(
-                ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-                gridDim.x, kokkos_impl_cuda_shared_memory<word_size_type>(),
-                m_scratch_space, m_scratch_flags)) {
-          // This is the final block with the final result at the final threads'
-          // location
-
-          word_size_type* const shared =
-              kokkos_impl_cuda_shared_memory<word_size_type>() +
-              (blockDim.y - 1) * word_count.value;
-          word_size_type* const global =
-              m_result_ptr_device_accessible
-                  ? reinterpret_cast<word_size_type*>(m_result_ptr)
-                  : (m_unified_space ? m_unified_space : m_scratch_space);
-
-          if (threadIdx.y == 0) {
-            Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-                ReducerConditional::select(m_functor, m_reducer), shared);
-          }
-
-          if (CudaTraits::WarpSize < word_count.value) {
-            __syncthreads();
-          }
-
-          for (unsigned i = threadIdx.y; i < word_count.value;
-               i += blockDim.y) {
-            global[i] = shared[i];
-          }
-        }
-      }
-    }
-  }
-  /*  __device__ inline
-     void run(const DummyShflReductionType&) const
-     {
-       value_type value;
-       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
-     &value);
-       // Number of blocks is bounded so that the reduction can be limited to
-     two passes.
-       // Each thread block is given an approximately equal amount of work to
-     perform.
-       // Accumulate the values for this block.
-       // The accumulation ordering does not match the final pass, but is
-     arithmatically equivalent.
-
-       const WorkRange range( m_policy , blockIdx.x , gridDim.x );
-
-       for ( Member iwork = range.begin() + threadIdx.y , iwork_end =
-     range.end() ; iwork < iwork_end ; iwork += blockDim.y ) { this-> template
-     exec_range< WorkTag >( iwork , value );
-       }
-
-       pointer_type const result = (pointer_type) (m_unified_space ?
-     m_unified_space : m_scratch_space) ;
-
-       int max_active_thread = range.end()-range.begin() < blockDim.y ?
-     range.end() - range.begin():blockDim.y;
-
-       max_active_thread = (max_active_thread ==
-     0)?blockDim.y:max_active_thread;
-
-      value_type init;
-      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
-     &init);
-       if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
-              (value,init,ValueJoin(ReducerConditional::select(m_functor ,
-     m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
-         const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
-         if(id==0) {
-           Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final(
-     ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
-           *result = value;
-         }
-       }
-     }*/
-
-  // Determine block size constrained by shared memory:
-  inline unsigned local_block_size(const FunctorType& f) {
-    unsigned n = CudaTraits::WarpSize * 8;
-    int shmem_size =
-        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-            f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<closure_type,
-                           LaunchBounds>::get_cuda_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned>(
-             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                             WorkTag>(f, n);
-    }
-    return n;
-  }
-
-  inline void execute() {
-    const index_type nwork     = m_policy.end() - m_policy.begin();
-    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
-                                 ReduceFunctorHasFinal<FunctorType>::value ||
-                                 !m_result_ptr_host_accessible ||
-#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
-                                 Policy::is_graph_kernel::value ||
-#endif
-                                 !std::is_same<ReducerType, InvalidType>::value;
-    if ((nwork > 0) || need_device_set) {
-      const int block_size = local_block_size(m_functor);
-
-      KOKKOS_ASSERT(block_size > 0);
-
-      // TODO: down casting these uses more space than required?
-      m_scratch_space = (word_size_type*)cuda_internal_scratch_space(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
-                                block_size /* block_size == max block_count */);
-
-      // Intentionally do not downcast to word_size_type since we use Cuda
-      // atomics in Kokkos_Cuda_ReduceScan.hpp
-      m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(),
-                                                    sizeof(Cuda::size_type));
-      m_unified_space =
-          reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
-              m_policy.space(),
-              ValueTraits::value_size(
-                  ReducerConditional::select(m_functor, m_reducer))));
-
-      // REQUIRED ( 1 , N , 1 )
-      dim3 block(1, block_size, 1);
-      // Required grid.x <= block.y
-      dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1,
-                1);
-
-      // TODO @graph We need to effectively insert this in to the graph
-      const int shmem =
-          UseShflReduction
-              ? 0
-              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                          WorkTag>(m_functor,
-                                                                   block.y);
-
-      if ((nwork == 0)
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-          || Kokkos::Impl::CudaInternal::cuda_use_serial_execution()
-#endif
-      ) {
-        block = dim3(1, 1, 1);
-        grid  = dim3(1, 1, 1);
-      }
-
-      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
-          *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
-
-      if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence(
-            "Kokkos::Impl::ParallelReduce<Cuda, RangePolicy>::execute: Result "
-            "Not Device Accessible");
-
-        if (m_result_ptr) {
-          if (m_unified_space) {
-            const int count = ValueTraits::value_count(
-                ReducerConditional::select(m_functor, m_reducer));
-            for (int i = 0; i < count; ++i) {
-              m_result_ptr[i] = pointer_type(m_unified_space)[i];
-            }
-          } else {
-            const int size = ValueTraits::value_size(
-                ReducerConditional::select(m_functor, m_reducer));
-            DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
-          }
-        }
-      }
-    } else {
-      if (m_result_ptr) {
-        // TODO @graph We need to effectively insert this in to the graph
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-      }
-    }
-  }
-
-  template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ViewType& arg_result,
-                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
-                                         void*>::type = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
-  }
-
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
-  }
-};
-
-// MDRangePolicy impl
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Cuda> {
- public:
-  using Policy = Kokkos::MDRangePolicy<Traits...>;
-
- private:
-  using array_index_type = typename Policy::array_index_type;
-  using index_type       = typename Policy::index_type;
-
-  using WorkTag      = typename Policy::work_tag;
-  using Member       = typename Policy::member_type;
-  using LaunchBounds = typename Policy::launch_bounds;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
- public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using value_type     = typename ValueTraits::value_type;
-  using reference_type = typename ValueTraits::reference_type;
-  using functor_type   = FunctorType;
-  using size_type      = Cuda::size_type;
-  using reducer_type   = ReducerType;
-
-  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
-  // blockDim.z == 1
-
-  const FunctorType m_functor;
-  const Policy m_policy;  // used for workrange and nwork
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type* m_unified_space;
-
-  using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
-      Policy::rank, Policy, FunctorType, typename Policy::work_tag,
-      reference_type>;
-
-  // Shall we use the shfl based reduction or not (only use it for static sized
-  // types of more than 128bit
-  static constexpr bool UseShflReduction = false;
-  //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize)
-  // Some crutch to do function overloading
- private:
-  using DummyShflReductionType  = double;
-  using DummySHMEMReductionType = int;
-
- public:
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelReduce,
-                           LaunchBounds>::get_cuda_func_attributes();
-    auto const& prop = pol.space().cuda_device_prop();
-    // Limits due do registers/SM
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
-  }
-  Policy const& get_policy() const { return m_policy; }
-  inline __device__ void exec_range(reference_type update) const {
-    Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType,
-                                            typename Policy::work_tag,
-                                            reference_type>(m_policy, m_functor,
-                                                            update)
-        .exec_range();
-  }
-
-  inline __device__ void operator()() const {
-    /*    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
-      DummySHMEMReductionType>::select(1,1.0) );
-      }
-
-      __device__ inline
-      void run(const DummySHMEMReductionType& ) const
-      {*/
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(ValueTraits::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
-
-    {
-      reference_type value =
-          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          kokkos_impl_cuda_shared_memory<size_type>() +
-                              threadIdx.y * word_count.value);
-
-      // Number of blocks is bounded so that the reduction can be limited to two
-      // passes. Each thread block is given an approximately equal amount of
-      // work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
-      // equivalent.
-
-      this->exec_range(value);
-    }
-
-    // Reduce with final value at blockDim.y - 1 location.
-    // Problem: non power-of-two blockDim
-    if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd, WorkTagFwd>(
-            ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-            gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
-            m_scratch_space, m_scratch_flags)) {
-      // This is the final block with the final result at the final threads'
-      // location
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
-          m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
-              : (m_unified_space ? m_unified_space : m_scratch_space);
-
-      if (threadIdx.y == 0) {
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), shared);
-      }
-
-      if (CudaTraits::WarpSize < word_count.value) {
-        __syncthreads();
-      }
-
-      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
-        global[i] = shared[i];
-      }
-    }
-  }
-
-  /*  __device__ inline
-     void run(const DummyShflReductionType&) const
-     {
-
-       value_type value;
-       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
-     &value);
-       // Number of blocks is bounded so that the reduction can be limited to
-     two passes.
-       // Each thread block is given an approximately equal amount of work to
-     perform.
-       // Accumulate the values for this block.
-       // The accumulation ordering does not match the final pass, but is
-     arithmatically equivalent.
-
-       const Member work_part =
-         ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion
-     of tiles handled by each block
-
-       this-> exec_range( value );
-
-       pointer_type const result = (pointer_type) (m_unified_space ?
-     m_unified_space : m_scratch_space) ;
-
-       int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
-       max_active_thread = (max_active_thread ==
-     0)?blockDim.y:max_active_thread;
-
-       value_type init;
-       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
-     &init);
-       if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
-           (value,init,ValueJoin(ReducerConditional::select(m_functor ,
-     m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
-         const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
-         if(id==0) {
-           Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final(
-     ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
-           *result = value;
-         }
-       }
-     }
-  */
-  // Determine block size constrained by shared memory:
-  inline unsigned local_block_size(const FunctorType& f) {
-    unsigned n = CudaTraits::WarpSize * 8;
-    int shmem_size =
-        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-            f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<closure_type,
-                           LaunchBounds>::get_cuda_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned>(
-             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                             WorkTag>(f, n);
-    }
-    return n;
-  }
-
-  inline void execute() {
-    const auto nwork = m_policy.m_num_tiles;
-    if (nwork) {
-      int block_size = m_policy.m_prod_tile_dims;
-      // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
-      // Nearest power of two
-      int exponent_pow_two    = std::ceil(std::log2(block_size));
-      block_size              = std::pow(2, exponent_pow_two);
-      int suggested_blocksize = local_block_size(m_functor);
-
-      block_size = (block_size > suggested_blocksize)
-                       ? block_size
-                       : suggested_blocksize;  // Note: block_size must be less
-                                               // than or equal to 512
-
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
-                                block_size /* block_size == max block_count */);
-      m_scratch_flags =
-          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-      m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)));
-
-      // REQUIRED ( 1 , N , 1 )
-      const dim3 block(1, block_size, 1);
-      // Required grid.x <= block.y
-      const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1);
-
-      // TODO @graph We need to effectively insert this in to the graph
-      const int shmem =
-          UseShflReduction
-              ? 0
-              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                          WorkTag>(m_functor,
-                                                                   block.y);
-
-      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
-          *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
-
-      if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence(
-            "Kokkos::Impl::ParallelReduce<Cuda, MDRangePolicy>::execute: "
-            "Result Not Device Accessible");
-
-        if (m_result_ptr) {
-          if (m_unified_space) {
-            const int count = ValueTraits::value_count(
-                ReducerConditional::select(m_functor, m_reducer));
-            for (int i = 0; i < count; ++i) {
-              m_result_ptr[i] = pointer_type(m_unified_space)[i];
-            }
-          } else {
-            const int size = ValueTraits::value_size(
-                ReducerConditional::select(m_functor, m_reducer));
-            DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
-          }
-        }
-      }
-    } else {
-      if (m_result_ptr) {
-        // TODO @graph We need to effectively insert this in to the graph
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-      }
-    }
-  }
-
-  template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ViewType& arg_result,
-                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
-                                         void*>::type = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
-  }
-
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr) {
-    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
-  }
-};
-
-//----------------------------------------------------------------------------
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Cuda> {
- public:
-  using Policy = TeamPolicy<Properties...>;
-
- private:
-  using Member       = typename Policy::member_type;
-  using WorkTag      = typename Policy::work_tag;
-  using LaunchBounds = typename Policy::launch_bounds;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-  using value_type     = typename ValueTraits::value_type;
-
- public:
-  using functor_type = FunctorType;
-  using size_type    = Cuda::size_type;
-  using reducer_type = ReducerType;
-
-  enum : bool {
-    UseShflReduction = (true && (ValueTraits::StaticValueSize != 0))
-  };
-
- private:
-  using DummyShflReductionType  = double;
-  using DummySHMEMReductionType = int;
-
-  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
-  // blockDim.z == 1 shared memory utilization:
-  //
-  //  [ global reduce space ]
-  //  [ team   reduce space ]
-  //  [ team   shared space ]
-  //
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const bool m_result_ptr_device_accessible;
-  const bool m_result_ptr_host_accessible;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type* m_unified_space;
-  size_type m_team_begin;
-  size_type m_shmem_begin;
-  size_type m_shmem_size;
-  void* m_scratch_ptr[2];
-  int m_scratch_size[2];
-  int m_scratch_pool_id = -1;
-  int32_t* m_scratch_locks;
-  const size_type m_league_size;
-  int m_team_size;
-  const size_type m_vector_size;
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_team(const Member& member, reference_type update) const {
-    m_functor(member, update);
-  }
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_team(const Member& member, reference_type update) const {
-    m_functor(TagType(), member, update);
-  }
-
- public:
-  Policy const& get_policy() const { return m_policy; }
-
-  __device__ inline void operator()() const {
-    int64_t threadid = 0;
-    if (m_scratch_size[1] > 0) {
-      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
-    }
-
-    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
-                           DummySHMEMReductionType>::select(1, 1.0),
-        threadid);
-    if (m_scratch_size[1] > 0) {
-      cuda_release_scratch_index(m_scratch_locks, threadid);
-    }
-  }
-
-  __device__ inline void run(const DummySHMEMReductionType&,
-                             const int& threadid) const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(ValueTraits::value_size(
-                       ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
-
-    reference_type value =
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        kokkos_impl_cuda_shared_memory<size_type>() +
-                            threadIdx.y * word_count.value);
-
-    // Iterate this block through the league
-    const int int_league_size = (int)m_league_size;
-    for (int league_rank = blockIdx.x; league_rank < int_league_size;
-         league_rank += gridDim.x) {
-      this->template exec_team<WorkTag>(
-          Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin,
-                 m_shmem_begin, m_shmem_size,
-                 (void*)(((char*)m_scratch_ptr[1]) +
-                         ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
-                             m_scratch_size[1]),
-                 m_scratch_size[1], league_rank, m_league_size),
-          value);
-    }
-
-    // Reduce with final value at blockDim.y - 1 location.
-    // Doing code duplication here to fix issue #3428
-    // Suspect optimizer bug??
-    if (m_league_size == 0) {
-      // This is the final block with the final result at the final threads'
-      // location
-
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
-          m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
-              : (m_unified_space ? m_unified_space : m_scratch_space);
-
-      if (threadIdx.y == 0) {
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), shared);
-      }
-
-      if (CudaTraits::WarpSize < word_count.value) {
-        __syncthreads();
-      }
-
-      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
-        global[i] = shared[i];
-      }
-    }
-
-    if (m_league_size != 0) {
-      if (cuda_single_inter_block_reduce_scan<false, FunctorType, WorkTag>(
-              ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-              gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
-              m_scratch_space, m_scratch_flags)) {
-        // This is the final block with the final result at the final threads'
-        // location
-
-        size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                  (blockDim.y - 1) * word_count.value;
-        size_type* const global =
-            m_result_ptr_device_accessible
-                ? reinterpret_cast<size_type*>(m_result_ptr)
-                : (m_unified_space ? m_unified_space : m_scratch_space);
-
-        if (threadIdx.y == 0) {
-          Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-              ReducerConditional::select(m_functor, m_reducer), shared);
-        }
-
-        if (CudaTraits::WarpSize < word_count.value) {
-          __syncthreads();
-        }
-
-        for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
-          global[i] = shared[i];
-        }
-      }
-    }
-  }
-
-  __device__ inline void run(const DummyShflReductionType&,
-                             const int& threadid) const {
-    value_type value;
-    ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &value);
-
-    // Iterate this block through the league
-    const int int_league_size = (int)m_league_size;
-    for (int league_rank = blockIdx.x; league_rank < int_league_size;
-         league_rank += gridDim.x) {
-      this->template exec_team<WorkTag>(
-          Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin,
-                 m_shmem_begin, m_shmem_size,
-                 (void*)(((char*)m_scratch_ptr[1]) +
-                         ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
-                             m_scratch_size[1]),
-                 m_scratch_size[1], league_rank, m_league_size),
-          value);
-    }
-
-    pointer_type const result =
-        m_result_ptr_device_accessible
-            ? m_result_ptr
-            : (pointer_type)(m_unified_space ? m_unified_space
-                                             : m_scratch_space);
-
-    value_type init;
-    ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &init);
-
-    if (int_league_size == 0) {
-      Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-          ReducerConditional::select(m_functor, m_reducer), (void*)&value);
-      *result = value;
-    } else if (
-        Impl::cuda_inter_block_reduction<FunctorType, ValueJoin, WorkTag>(
-            value, init,
-            ValueJoin(ReducerConditional::select(m_functor, m_reducer)),
-            m_scratch_space, result, m_scratch_flags, blockDim.y)
-        // This breaks a test
-        //   Kokkos::Impl::CudaReductionsFunctor<FunctorType,WorkTag,false,true>::scalar_inter_block_reduction(ReducerConditional::select(m_functor
-        //   , m_reducer) , blockIdx.x , gridDim.x ,
-        //              kokkos_impl_cuda_shared_memory<size_type>() ,
-        //              m_scratch_space , m_scratch_flags)
-    ) {
-      const unsigned id = threadIdx.y * blockDim.x + threadIdx.x;
-      if (id == 0) {
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), (void*)&value);
-        *result = value;
-      }
-    }
-  }
-
-  inline void execute() {
-    const bool is_empty_range  = m_league_size == 0 || m_team_size == 0;
-    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
-                                 ReduceFunctorHasFinal<FunctorType>::value ||
-                                 !m_result_ptr_host_accessible ||
-#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
-                                 Policy::is_graph_kernel::value ||
-#endif
-                                 !std::is_same<ReducerType, InvalidType>::value;
-    if (!is_empty_range || need_device_set) {
-      const int block_count =
-          UseShflReduction ? std::min(m_league_size, size_type(1024 * 32))
-                           : std::min(int(m_league_size), m_team_size);
-
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)) *
-                                block_count);
-      m_scratch_flags =
-          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-      m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)));
-
-      dim3 block(m_vector_size, m_team_size, 1);
-      dim3 grid(block_count, 1, 1);
-      const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-
-      if (is_empty_range
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-          || Kokkos::Impl::CudaInternal::cuda_use_serial_execution()
-#endif
-      ) {
-        block = dim3(1, 1, 1);
-        grid  = dim3(1, 1, 1);
-      }
-
-      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
-          *this, grid, block, shmem_size_total,
-          m_policy.space().impl_internal_space_instance(),
-          true);  // copy to device and execute
-
-      if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence(
-            "Kokkos::Impl::ParallelReduce<Cuda, TeamPolicy>::execute: Result "
-            "Not Device Accessible");
-
-        if (m_result_ptr) {
-          if (m_unified_space) {
-            const int count = ValueTraits::value_count(
-                ReducerConditional::select(m_functor, m_reducer));
-            for (int i = 0; i < count; ++i) {
-              m_result_ptr[i] = pointer_type(m_unified_space)[i];
-            }
-          } else {
-            const int size = ValueTraits::value_size(
-                ReducerConditional::select(m_functor, m_reducer));
-            DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
-          }
-        }
-      }
-    } else {
-      if (m_result_ptr) {
-        // TODO @graph We need to effectively insert this in to the graph
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-      }
-    }
-  }
-
-  template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ViewType& arg_result,
-                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
-                                         void*>::type = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ViewType::memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr),
-        m_team_begin(0),
-        m_shmem_begin(0),
-        m_shmem_size(0),
-        m_scratch_ptr{nullptr, nullptr},
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelReduce,
-                           LaunchBounds>::get_cuda_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
-
-    m_team_begin =
-        UseShflReduction
-            ? 0
-            : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                        WorkTag>(arg_functor,
-                                                                 m_team_size);
-    m_shmem_begin = sizeof(double) * (m_team_size + 2);
-    m_shmem_size =
-        m_policy.scratch_size(0, m_team_size) +
-        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks =
-        m_policy.space().impl_internal_space_instance()->m_scratch_locks;
-    if (m_team_size <= 0) {
-      m_scratch_ptr[1] = nullptr;
-    } else {
-      auto scratch_ptr_id =
-          m_policy.space()
-              .impl_internal_space_instance()
-              ->resize_team_scratch_space(
-                  static_cast<std::int64_t>(m_scratch_size[1]) *
-                  (std::min(
-                      static_cast<std::int64_t>(Cuda::concurrency() /
-                                                (m_team_size * m_vector_size)),
-                      static_cast<std::int64_t>(m_league_size))));
-      m_scratch_ptr[1]  = scratch_ptr_id.first;
-      m_scratch_pool_id = scratch_ptr_id.second;
-    }
-
-    // The global parallel_reduce does not support vector_length other than 1 at
-    // the moment
-    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
-          "greater than 1 is not currently supported for CUDA for dynamic "
-          "sized reduction types.");
-
-    if ((m_team_size < 32) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
-          "than 32 is not currently supported with CUDA for dynamic sized "
-          "reduction types.");
-
-    // Functor's reduce memory, team scan memory, and team shared memory depend
-    // upon team size.
-
-    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-
-    if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
-        !UseShflReduction) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
-    }
-
-    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-        shmem_size_total) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much "
-                      "L0 scratch memory"));
-    }
-
-    if (int(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
-                      "large team size."));
-    }
-  }
-
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_device_accessible(
-            MemorySpaceAccess<Kokkos::CudaSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_result_ptr_host_accessible(
-            MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ReducerType::result_view_type::
-                                  memory_space>::accessible),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_unified_space(nullptr),
-        m_team_begin(0),
-        m_shmem_begin(0),
-        m_shmem_size(0),
-        m_scratch_ptr{nullptr, nullptr},
-        m_league_size(arg_policy.league_size()),
-        m_team_size(arg_policy.team_size()),
-        m_vector_size(arg_policy.impl_vector_length()) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelReduce,
-                           LaunchBounds>::get_cuda_func_attributes();
-
-    // Valid team size not provided, deduce team size
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
-
-    m_team_begin =
-        UseShflReduction
-            ? 0
-            : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                        WorkTag>(arg_functor,
-                                                                 m_team_size);
-    m_shmem_begin = sizeof(double) * (m_team_size + 2);
-    m_shmem_size =
-        m_policy.scratch_size(0, m_team_size) +
-        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
-    m_scratch_size[0] = m_shmem_size;
-    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_locks =
-        m_policy.space().impl_internal_space_instance()->m_scratch_locks;
-    if (m_team_size <= 0) {
-      m_scratch_ptr[1] = nullptr;
-    } else {
-      auto scratch_ptr_id =
-          m_policy.space()
-              .impl_internal_space_instance()
-              ->resize_team_scratch_space(
-                  static_cast<std::int64_t>(m_scratch_size[1]) *
-                  (std::min(
-                      static_cast<std::int64_t>(Cuda::concurrency() /
-                                                (m_team_size * m_vector_size)),
-                      static_cast<std::int64_t>(m_league_size))));
-      m_scratch_ptr[1]  = scratch_ptr_id.first;
-      m_scratch_pool_id = scratch_ptr_id.second;
-    }
-
-    // The global parallel_reduce does not support vector_length other than 1 at
-    // the moment
-    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
-          "greater than 1 is not currently supported for CUDA for dynamic "
-          "sized reduction types.");
-
-    if ((m_team_size < 32) && !UseShflReduction)
-      Impl::throw_runtime_exception(
-          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
-          "than 32 is not currently supported with CUDA for dynamic sized "
-          "reduction types.");
-
-    // Functor's reduce memory, team scan memory, and team shared memory depend
-    // upon team size.
-
-    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-
-    if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
-         !UseShflReduction) ||
-        m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-            shmem_size_total) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
-    }
-
-    size_type team_size_max =
-        Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
-            m_policy.space().impl_internal_space_instance(), attr, m_functor,
-            m_vector_size, m_policy.team_scratch_size(0),
-            m_policy.thread_scratch_size(0)) /
-        m_vector_size;
-
-    if ((int)m_team_size > (int)team_size_max) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
-                      "large team size."));
-    }
-  }
-
-  ~ParallelReduce() {
-    if (m_scratch_pool_id >= 0) {
-      m_policy.space()
-          .impl_internal_space_instance()
-          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
-    }
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
- public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
- private:
-  using Member       = typename Policy::member_type;
-  using WorkTag      = typename Policy::work_tag;
-  using WorkRange    = typename Policy::WorkRange;
-  using LaunchBounds = typename Policy::launch_bounds;
-
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
-  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
-
- public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-  using functor_type   = FunctorType;
-  using size_type      = Cuda::size_type;
-
- private:
-  // Algorithmic constraints:
-  //  (a) blockDim.y is a power of two
-  //  (b) blockDim.y == blockDim.z == 1
-  //  (c) gridDim.x  <= blockDim.y * blockDim.y
-  //  (d) gridDim.y  == gridDim.z == 1
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type m_final;
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-  bool m_run_serial;
-#endif
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update,
-                 const bool final_result) const {
-    m_functor(i, update, final_result);
-  }
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update,
-                 const bool final_result) const {
-    m_functor(TagType(), i, update, final_result);
-  }
-
-  //----------------------------------------
-
-  __device__ inline void initial() const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
-
-    size_type* const shared_value =
-        kokkos_impl_cuda_shared_memory<size_type>() +
-        word_count.value * threadIdx.y;
-
-    ValueInit::init(m_functor, shared_value);
-
-    // Number of blocks is bounded so that the reduction can be limited to two
-    // passes. Each thread block is given an approximately equal amount of work
-    // to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically equivalent.
-
-    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
-
-    for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
-         iwork < iwork_end; iwork += blockDim.y) {
-      this->template exec_range<WorkTag>(
-          iwork, ValueOps::reference(shared_value), false);
-    }
-
-    // Reduce and scan, writing out scan of blocks' totals and block-groups'
-    // totals. Blocks' scan values are written to 'blockIdx.x' location.
-    // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i <
-    // gridDim.x
-    cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>(
-        m_functor, blockIdx.x, gridDim.x,
-        kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
-        m_scratch_flags);
-  }
-
-  //----------------------------------------
-
-  __device__ inline void final() const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
-
-    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
-    // value[2] , ... }
-    size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
-    size_type* const shared_prefix =
-        shared_data + word_count.value * threadIdx.y;
-    size_type* const shared_accum =
-        shared_data + word_count.value * (blockDim.y + 1);
-
-    // Starting value for this thread block is the previous block's total.
-    if (blockIdx.x) {
-      size_type* const block_total =
-          m_scratch_space + word_count.value * (blockIdx.x - 1);
-      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
-        shared_accum[i] = block_total[i];
-      }
-    } else if (0 == threadIdx.y) {
-      ValueInit::init(m_functor, shared_accum);
-    }
-
-    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
-
-    for (typename Policy::member_type iwork_base = range.begin();
-         iwork_base < range.end(); iwork_base += blockDim.y) {
-      unsigned MASK                            = __activemask();
-      const typename Policy::member_type iwork = iwork_base + threadIdx.y;
-
-      __syncthreads();  // Don't overwrite previous iteration values until they
-                        // are used
-
-      ValueInit::init(m_functor, shared_prefix + word_count.value);
-
-      // Copy previous block's accumulation total into thread[0] prefix and
-      // inclusive scan value of this block
-      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
-        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
-      }
-      __syncwarp(MASK);
-      if (CudaTraits::WarpSize < word_count.value) {
-        __syncthreads();
-      }  // Protect against large scan values.
-
-      // Call functor to accumulate inclusive scan value for this work item
-      if (iwork < range.end()) {
-        this->template exec_range<WorkTag>(
-            iwork, ValueOps::reference(shared_prefix + word_count.value),
-            false);
-      }
-
-      // Scan block values into locations shared_data[1..blockDim.y]
-      cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>(
-          m_functor,
-          typename ValueTraits::pointer_type(shared_data + word_count.value));
-
-      {
-        size_type* const block_total =
-            shared_data + word_count.value * blockDim.y;
-        for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
-          shared_accum[i] = block_total[i];
-        }
-      }
-
-      // Call functor with exclusive scan value
-      if (iwork < range.end()) {
-        this->template exec_range<WorkTag>(
-            iwork, ValueOps::reference(shared_prefix), true);
-      }
-    }
-  }
-
- public:
-  Policy const& get_policy() const { return m_policy; }
-
-  //----------------------------------------
-
-  __device__ inline void operator()() const {
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-    if (m_run_serial) {
-      typename ValueTraits::value_type value;
-      ValueInit::init(m_functor, (void*)&value);
-      const WorkRange range(m_policy, blockIdx.x, gridDim.x);
-
-      for (typename Policy::member_type iwork_base = range.begin();
-           iwork_base < range.end(); iwork_base++) {
-        this->template exec_range<WorkTag>(iwork_base, value, true);
-      }
-    } else {
-#endif
-      if (!m_final) {
-        initial();
-      } else {
-        final();
-      }
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-    }
-#endif
-  }
-
-  // Determine block size constrained by shared memory:
-  inline unsigned local_block_size(const FunctorType& f) {
-    // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512
-    // (16 warps) gridDim.x <= blockDim.y * blockDim.y
-    //
-    // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
-    // testing
-
-    unsigned n = CudaTraits::WarpSize * 4;
-    while (n &&
-           unsigned(m_policy.space()
-                        .impl_internal_space_instance()
-                        ->m_maxShmemPerBlock) <
-               cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                         WorkTag>(f, n)) {
-      n >>= 1;
-    }
-    return n;
-  }
-
-  inline void execute() {
-    const auto nwork = m_policy.end() - m_policy.begin();
-    if (nwork) {
-      enum { GridMaxComputeCapability_2x = 0x0ffff };
-
-      const int block_size = local_block_size(m_functor);
-      KOKKOS_ASSERT(block_size > 0);
-
-      const int grid_max =
-          (block_size * block_size) < GridMaxComputeCapability_2x
-              ? (block_size * block_size)
-              : GridMaxComputeCapability_2x;
-
-      // At most 'max_grid' blocks:
-      const int max_grid =
-          std::min(int(grid_max), int((nwork + block_size - 1) / block_size));
-
-      // How much work per block:
-      const int work_per_block = (nwork + max_grid - 1) / max_grid;
-
-      // How many block are really needed for this much work:
-      const int grid_x = (nwork + work_per_block - 1) / work_per_block;
-
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), ValueTraits::value_size(m_functor) * grid_x);
-      m_scratch_flags =
-          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
-
-      dim3 grid(grid_x, 1, 1);
-      dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
-
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-      if (m_run_serial) {
-        block = dim3(1, 1, 1);
-        grid  = dim3(1, 1, 1);
-      } else {
-#endif
-        m_final = false;
-        CudaParallelLaunch<ParallelScan, LaunchBounds>(
-            *this, grid, block, shmem,
-            m_policy.space().impl_internal_space_instance(),
-            false);  // copy to device and execute
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-      }
-#endif
-      m_final = true;
-      CudaParallelLaunch<ParallelScan, LaunchBounds>(
-          *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
-    }
-  }
-
-  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_final(false)
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-        ,
-        m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution())
-#endif
-  {
-  }
-};
-
-//----------------------------------------------------------------------------
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::Cuda> {
- public:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
- private:
-  using Member       = typename Policy::member_type;
-  using WorkTag      = typename Policy::work_tag;
-  using WorkRange    = typename Policy::WorkRange;
-  using LaunchBounds = typename Policy::launch_bounds;
-
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
-  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
-
- public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-  using functor_type   = FunctorType;
-  using size_type      = Cuda::size_type;
-
- private:
-  // Algorithmic constraints:
-  //  (a) blockDim.y is a power of two
-  //  (b) blockDim.y == blockDim.z == 1
-  //  (c) gridDim.x  <= blockDim.y * blockDim.y
-  //  (d) gridDim.y  == gridDim.z == 1
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type m_final;
-  ReturnType& m_returnvalue;
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-  bool m_run_serial;
-#endif
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update,
-                 const bool final_result) const {
-    m_functor(i, update, final_result);
-  }
-
-  template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update,
-                 const bool final_result) const {
-    m_functor(TagType(), i, update, final_result);
-  }
-
-  //----------------------------------------
-
-  __device__ inline void initial() const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
-
-    size_type* const shared_value =
-        kokkos_impl_cuda_shared_memory<size_type>() +
-        word_count.value * threadIdx.y;
-
-    ValueInit::init(m_functor, shared_value);
-
-    // Number of blocks is bounded so that the reduction can be limited to two
-    // passes. Each thread block is given an approximately equal amount of work
-    // to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically equivalent.
-
-    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
-
-    for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
-         iwork < iwork_end; iwork += blockDim.y) {
-      this->template exec_range<WorkTag>(
-          iwork, ValueOps::reference(shared_value), false);
-    }
-
-    // Reduce and scan, writing out scan of blocks' totals and block-groups'
-    // totals. Blocks' scan values are written to 'blockIdx.x' location.
-    // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i <
-    // gridDim.x
-    cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>(
-        m_functor, blockIdx.x, gridDim.x,
-        kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
-        m_scratch_flags);
-  }
-
-  //----------------------------------------
-
-  __device__ inline void final() const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
-
-    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
-    // value[2] , ... }
-    size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
-    size_type* const shared_prefix =
-        shared_data + word_count.value * threadIdx.y;
-    size_type* const shared_accum =
-        shared_data + word_count.value * (blockDim.y + 1);
-
-    // Starting value for this thread block is the previous block's total.
-    if (blockIdx.x) {
-      size_type* const block_total =
-          m_scratch_space + word_count.value * (blockIdx.x - 1);
-      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
-        shared_accum[i] = block_total[i];
-      }
-    } else if (0 == threadIdx.y) {
-      ValueInit::init(m_functor, shared_accum);
-    }
-
-    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
-
-    for (typename Policy::member_type iwork_base = range.begin();
-         iwork_base < range.end(); iwork_base += blockDim.y) {
-      unsigned MASK = __activemask();
-
-      const typename Policy::member_type iwork = iwork_base + threadIdx.y;
-
-      __syncthreads();  // Don't overwrite previous iteration values until they
-                        // are used
-
-      ValueInit::init(m_functor, shared_prefix + word_count.value);
-
-      // Copy previous block's accumulation total into thread[0] prefix and
-      // inclusive scan value of this block
-      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
-        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
-      }
-
-      __syncwarp(MASK);
-      if (CudaTraits::WarpSize < word_count.value) {
-        __syncthreads();
-      }  // Protect against large scan values.
-
-      // Call functor to accumulate inclusive scan value for this work item
-      if (iwork < range.end()) {
-        this->template exec_range<WorkTag>(
-            iwork, ValueOps::reference(shared_prefix + word_count.value),
-            false);
-      }
-
-      // Scan block values into locations shared_data[1..blockDim.y]
-      cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>(
-          m_functor,
-          typename ValueTraits::pointer_type(shared_data + word_count.value));
-
-      {
-        size_type* const block_total =
-            shared_data + word_count.value * blockDim.y;
-        for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
-          shared_accum[i] = block_total[i];
-        }
-      }
-
-      // Call functor with exclusive scan value
-      if (iwork < range.end()) {
-        this->template exec_range<WorkTag>(
-            iwork, ValueOps::reference(shared_prefix), true);
-      }
-    }
-  }
-
- public:
-  Policy const& get_policy() const { return m_policy; }
-
-  //----------------------------------------
-
-  __device__ inline void operator()() const {
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-    if (m_run_serial) {
-      typename ValueTraits::value_type value;
-      ValueInit::init(m_functor, (void*)&value);
-      const WorkRange range(m_policy, blockIdx.x, gridDim.x);
-
-      for (typename Policy::member_type iwork_base = range.begin();
-           iwork_base < range.end(); iwork_base++) {
-        this->template exec_range<WorkTag>(iwork_base, value, true);
-      }
-      *((typename ValueTraits::value_type*)m_scratch_space) = value;
-    } else {
-#endif
-      if (!m_final) {
-        initial();
-      } else {
-        final();
-      }
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-    }
-#endif
-  }
-
-  // Determine block size constrained by shared memory:
-  inline unsigned local_block_size(const FunctorType& f) {
-    // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512
-    // (16 warps) gridDim.x <= blockDim.y * blockDim.y
-    //
-    // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
-    // testing
-
-    unsigned n = CudaTraits::WarpSize * 4;
-    while (n &&
-           unsigned(m_policy.space()
-                        .impl_internal_space_instance()
-                        ->m_maxShmemPerBlock) <
-               cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                         WorkTag>(f, n)) {
-      n >>= 1;
-    }
-    return n;
-  }
-
-  inline void execute() {
-    const auto nwork = m_policy.end() - m_policy.begin();
-    if (nwork) {
-      enum { GridMaxComputeCapability_2x = 0x0ffff };
-
-      const int block_size = local_block_size(m_functor);
-      KOKKOS_ASSERT(block_size > 0);
-
-      const int grid_max =
-          (block_size * block_size) < GridMaxComputeCapability_2x
-              ? (block_size * block_size)
-              : GridMaxComputeCapability_2x;
-
-      // At most 'max_grid' blocks:
-      const int max_grid =
-          std::min(int(grid_max), int((nwork + block_size - 1) / block_size));
-
-      // How much work per block:
-      const int work_per_block = (nwork + max_grid - 1) / max_grid;
-
-      // How many block are really needed for this much work:
-      const int grid_x = (nwork + work_per_block - 1) / work_per_block;
-
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), ValueTraits::value_size(m_functor) * grid_x);
-      m_scratch_flags =
-          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
-
-      dim3 grid(grid_x, 1, 1);
-      dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
-
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-      if (m_run_serial) {
-        block = dim3(1, 1, 1);
-        grid  = dim3(1, 1, 1);
-      } else {
-#endif
-
-        m_final = false;
-        CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
-            *this, grid, block, shmem,
-            m_policy.space().impl_internal_space_instance(),
-            false);  // copy to device and execute
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-      }
-#endif
-      m_final = true;
-      CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
-          *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
-
-      const int size = ValueTraits::value_size(m_functor);
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-      if (m_run_serial)
-        DeepCopy<HostSpace, CudaSpace>(&m_returnvalue, m_scratch_space, size);
-      else
-#endif
-        DeepCopy<HostSpace, CudaSpace>(
-            &m_returnvalue, m_scratch_space + (grid_x - 1) * size / sizeof(int),
-            size);
-    }
-  }
-
-  ParallelScanWithTotal(const FunctorType& arg_functor,
-                        const Policy& arg_policy, ReturnType& arg_returnvalue)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_scratch_space(nullptr),
-        m_scratch_flags(nullptr),
-        m_final(false),
-        m_returnvalue(arg_returnvalue)
-#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
-        ,
-        m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution())
-#endif
-  {
-  }
-};
-
-}  // namespace Impl
-
-}  // namespace Kokkos
-
-#endif /* defined(KOKKOS_ENABLE_CUDA) */
-#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
new file mode 100644
index 000000000..e586bb4cc
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -0,0 +1,477 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_MD_RANGE_HPP
+#define KOKKOS_CUDA_PARALLEL_MD_RANGE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_CUDA)
+
+#include <algorithm>
+#include <string>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
+#include <Kokkos_MinMaxClamp.hpp>
+
+#include <impl/Kokkos_Tools.hpp>
+#include <typeinfo>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+#include <impl/KokkosExp_IterateTileGPU.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
+ public:
+  using Policy       = Kokkos::MDRangePolicy<Traits...>;
+  using functor_type = FunctorType;
+
+ private:
+  using RP               = Policy;
+  using array_index_type = typename Policy::array_index_type;
+  using index_type       = typename Policy::index_type;
+  using LaunchBounds     = typename Policy::launch_bounds;
+
+  const FunctorType m_functor;
+  const Policy m_rp;
+
+ public:
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelFor,
+                           LaunchBounds>::get_cuda_func_attributes();
+    auto const& prop = pol.space().cuda_device_prop();
+    // Limits due to registers/SM, MDRange doesn't have
+    // shared memory constraints
+    int const regs_per_sm        = prop.regsPerMultiprocessor;
+    int const regs_per_thread    = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+  }
+  Policy const& get_policy() const { return m_rp; }
+  inline __device__ void operator()() const {
+    Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
+                                    typename Policy::work_tag>(m_rp, m_functor)
+        .exec_range();
+  }
+
+  inline void execute() const {
+    if (m_rp.m_num_tiles == 0) return;
+    const auto maxblocks = cuda_internal_maximum_grid_count();
+    if (RP::rank == 2) {
+      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
+      const dim3 grid(
+          std::min<array_index_type>(
+              (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
+              maxblocks[0]),
+          std::min<array_index_type>(
+              (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
+              maxblocks[1]),
+          1);
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 3) {
+      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
+      const dim3 grid(
+          std::min<array_index_type>(
+              (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
+              maxblocks[0]),
+          std::min<array_index_type>(
+              (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
+              maxblocks[1]),
+          std::min<array_index_type>(
+              (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
+              maxblocks[2]));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 4) {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
+      // threadIdx.z
+      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
+                       m_rp.m_tile[3]);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
+      const dim3 grid(
+          std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1],
+                                     maxblocks[0]),
+          std::min<array_index_type>(
+              (m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
+              maxblocks[1]),
+          std::min<array_index_type>(
+              (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
+              maxblocks[2]));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 5) {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
+      // threadIdx.z
+      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
+                       m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
+      KOKKOS_ASSERT(block.z > 0);
+      const dim3 grid(
+          std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1],
+                                     maxblocks[0]),
+          std::min<array_index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3],
+                                     maxblocks[1]),
+          std::min<array_index_type>(
+              (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
+              maxblocks[2]));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 6) {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
+      // threadIdx.z
+      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
+                       m_rp.m_tile[2] * m_rp.m_tile[3],
+                       m_rp.m_tile[4] * m_rp.m_tile[5]);
+      const dim3 grid(
+          std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1],
+                                     maxblocks[0]),
+          std::min<array_index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3],
+                                     maxblocks[1]),
+          std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
+                                     maxblocks[2]));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else {
+      Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+    }
+
+  }  // end execute
+
+  //  inline
+  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+      : m_functor(arg_functor), m_rp(arg_policy) {}
+};
+
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
+                     Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::MDRangePolicy<Traits...>;
+
+ private:
+  using array_index_type = typename Policy::array_index_type;
+  using index_type       = typename Policy::index_type;
+
+  using WorkTag      = typename Policy::work_tag;
+  using Member       = typename Policy::member_type;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
+                                    ReducerTypeFwd>;
+
+ public:
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Cuda::size_type;
+  using reducer_type   = ReducerType;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
+  // blockDim.z == 1
+
+  const FunctorType m_functor;
+  const Policy m_policy;  // used for workrange and nwork
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+  size_type* m_scratch_space;
+  size_type* m_scratch_flags;
+  size_type* m_unified_space;
+
+  using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
+      Policy::rank, Policy, FunctorType, typename Policy::work_tag,
+      reference_type>;
+
+  // Shall we use the shfl based reduction or not (only use it for static sized
+  // types of more than 128bit
+  static constexpr bool UseShflReduction = false;
+  //((sizeof(value_type)>2*sizeof(double)) && Analysis::StaticValueSize)
+  // Some crutch to do function overloading
+
+ public:
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelReduce,
+                           LaunchBounds>::get_cuda_func_attributes();
+    auto const& prop = pol.space().cuda_device_prop();
+    // Limits due do registers/SM
+    int const regs_per_sm        = prop.regsPerMultiprocessor;
+    int const regs_per_thread    = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+  }
+  Policy const& get_policy() const { return m_policy; }
+  inline __device__ void exec_range(reference_type update) const {
+    Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType,
+                                            typename Policy::work_tag,
+                                            reference_type>(m_policy, m_functor,
+                                                            update)
+        .exec_range();
+  }
+
+  inline __device__ void operator()() const {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(Analysis::value_size(
+                       ReducerConditional::select(m_functor, m_reducer)) /
+                   sizeof(size_type));
+
+    {
+      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
+          kokkos_impl_cuda_shared_memory<size_type>() +
+          threadIdx.y * word_count.value));
+
+      // Number of blocks is bounded so that the reduction can be limited to two
+      // passes. Each thread block is given an approximately equal amount of
+      // work to perform. Accumulate the values for this block. The accumulation
+      // ordering does not match the final pass, but is arithmatically
+      // equivalent.
+
+      this->exec_range(value);
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    // Problem: non power-of-two blockDim
+    if (cuda_single_inter_block_reduce_scan<false>(
+            final_reducer, blockIdx.x, gridDim.x,
+            kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+            m_scratch_flags)) {
+      // This is the final block with the final result at the final threads'
+      // location
+      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
+                                (blockDim.y - 1) * word_count.value;
+      size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<size_type*>(m_result_ptr)
+              : (m_unified_space ? m_unified_space : m_scratch_space);
+
+      if (threadIdx.y == 0) {
+        final_reducer.final(reinterpret_cast<value_type*>(shared));
+      }
+
+      if (CudaTraits::WarpSize < word_count.value) {
+        __syncthreads();
+      }
+
+      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
+        global[i] = shared[i];
+      }
+    }
+  }
+
+  // Determine block size constrained by shared memory:
+  inline unsigned local_block_size(const FunctorType& f) {
+    unsigned n = CudaTraits::WarpSize * 8;
+    int shmem_size =
+        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
+            f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type,
+                           LaunchBounds>::get_cuda_func_attributes();
+    while (
+        (n &&
+         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+          shmem_size)) ||
+        (n >
+         static_cast<unsigned>(
+             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
+      n >>= 1;
+      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                             WorkTag>(f, n);
+    }
+    return n;
+  }
+
+  inline void execute() {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    const auto nwork = m_policy.m_num_tiles;
+    if (nwork) {
+      int block_size = m_policy.m_prod_tile_dims;
+      // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
+      // Nearest power of two
+      int exponent_pow_two    = std::ceil(std::log2(block_size));
+      block_size              = std::pow(2, exponent_pow_two);
+      int suggested_blocksize = local_block_size(m_functor);
+
+      block_size = (block_size > suggested_blocksize)
+                       ? block_size
+                       : suggested_blocksize;  // Note: block_size must be less
+                                               // than or equal to 512
+
+      m_scratch_space = cuda_internal_scratch_space(
+          m_policy.space(), Analysis::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)) *
+                                block_size /* block_size == max block_count */);
+      m_scratch_flags =
+          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
+      m_unified_space = cuda_internal_scratch_unified(
+          m_policy.space(), Analysis::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)));
+
+      // REQUIRED ( 1 , N , 1 )
+      const dim3 block(1, block_size, 1);
+      // Required grid.x <= block.y
+      const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1);
+
+      // TODO @graph We need to effectively insert this in to the graph
+      const int shmem =
+          UseShflReduction
+              ? 0
+              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                          WorkTag>(m_functor,
+                                                                   block.y);
+
+      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
+          *this, grid, block, shmem,
+          m_policy.space().impl_internal_space_instance(),
+          false);  // copy to device and execute
+
+      if (!m_result_ptr_device_accessible) {
+        if (m_result_ptr) {
+          if (m_unified_space) {
+            m_policy.space().fence(
+                "Kokkos::Impl::ParallelReduce<Cuda, MDRangePolicy>::execute: "
+                "Result Not Device Accessible");
+
+            const int count = Analysis::value_count(
+                ReducerConditional::select(m_functor, m_reducer));
+            for (int i = 0; i < count; ++i) {
+              m_result_ptr[i] = pointer_type(m_unified_space)[i];
+            }
+          } else {
+            const int size = Analysis::value_size(
+                ReducerConditional::select(m_functor, m_reducer));
+            DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), m_result_ptr,
+                                                 m_scratch_space, size);
+          }
+        }
+      }
+    } else {
+      if (m_result_ptr) {
+        // TODO @graph We need to effectively insert this in to the graph
+        final_reducer.init(m_result_ptr);
+      }
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(
+      const FunctorType& arg_functor, const Policy& arg_policy,
+      const ViewType& arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {
+    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
+  }
+
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {
+    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+#endif
+
+#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
new file mode 100644
index 000000000..987334300
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -0,0 +1,1049 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_RANGE_HPP
+#define KOKKOS_CUDA_PARALLEL_RANGE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_CUDA)
+
+#include <algorithm>
+#include <string>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
+#include <Kokkos_MinMaxClamp.hpp>
+
+#include <impl/Kokkos_Tools.hpp>
+#include <typeinfo>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ private:
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  ParallelFor()        = delete;
+  ParallelFor& operator=(const ParallelFor&) = delete;
+
+  template <class TagType>
+  inline __device__ std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const Member i) const {
+    m_functor(i);
+  }
+
+  template <class TagType>
+  inline __device__ std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const Member i) const {
+    m_functor(TagType(), i);
+  }
+
+ public:
+  using functor_type = FunctorType;
+
+  Policy const& get_policy() const { return m_policy; }
+
+  inline __device__ void operator()() const {
+    const Member work_stride = blockDim.y * gridDim.x;
+    const Member work_end    = m_policy.end();
+
+    for (Member iwork =
+             m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x;
+         iwork < work_end;
+         iwork = iwork < work_end - work_stride ? iwork + work_stride
+                                                : work_end) {
+      this->template exec_range<WorkTag>(iwork);
+    }
+  }
+
+  inline void execute() const {
+    const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
+
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelFor,
+                           LaunchBounds>::get_cuda_func_attributes();
+    const int block_size =
+        Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
+            m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
+            0, 0);
+    KOKKOS_ASSERT(block_size > 0);
+    dim3 block(1, block_size, 1);
+    dim3 grid(
+        std::min(
+            typename Policy::index_type((nwork + block.y - 1) / block.y),
+            typename Policy::index_type(cuda_internal_maximum_grid_count()[0])),
+        1, 1);
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
+      block = dim3(1, 1, 1);
+      grid  = dim3(1, 1, 1);
+    }
+#endif
+
+    CudaParallelLaunch<ParallelFor, LaunchBounds>(
+        *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
+        false);
+  }
+
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
+                     Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ private:
+  using WorkRange    = typename Policy::WorkRange;
+  using WorkTag      = typename Policy::work_tag;
+  using Member       = typename Policy::member_type;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
+                                    ReducerTypeFwd>;
+
+ public:
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
+  using functor_type   = FunctorType;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>,
+      Kokkos::Cuda::size_type>;
+  using index_type   = typename Policy::index_type;
+  using reducer_type = ReducerType;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
+  // blockDim.z == 1
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+  const bool m_result_ptr_host_accessible;
+  word_size_type* m_scratch_space;
+  // m_scratch_flags must be of type Cuda::size_type due to use of atomics
+  // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
+  Cuda::size_type* m_scratch_flags;
+  word_size_type* m_unified_space;
+
+  // FIXME_CUDA Shall we use the shfl based reduction or not (only use it for
+  // static sized types of more than 128bit:
+  // sizeof(value_type)>2*sizeof(double)) && Analysis::StaticValueSize)
+  static constexpr bool UseShflReduction = false;
+
+ public:
+  Policy const& get_policy() const { return m_policy; }
+
+  // Make the exec_range calls call to Reduce::DeviceIterateTile
+  template <class TagType>
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update) const {
+    m_functor(i, update);
+  }
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update) const {
+    m_functor(TagType(), i, update);
+  }
+
+  __device__ inline void operator()() const {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(
+                       ReducerConditional::select(m_functor, m_reducer)) /
+                   sizeof(word_size_type));
+
+    {
+      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          threadIdx.y * word_count.value));
+
+      // Number of blocks is bounded so that the reduction can be limited to two
+      // passes. Each thread block is given an approximately equal amount of
+      // work to perform. Accumulate the values for this block. The accumulation
+      // ordering does not match the final pass, but is arithmatically
+      // equivalent.
+
+      const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+      for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
+           iwork < iwork_end; iwork += blockDim.y) {
+        this->template exec_range<WorkTag>(iwork, value);
+      }
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    // Shortcut for length zero reduction
+    bool zero_length        = m_policy.begin() == m_policy.end();
+    bool do_final_reduction = true;
+    if (!zero_length)
+      do_final_reduction = cuda_single_inter_block_reduce_scan<false>(
+          final_reducer, blockIdx.x, gridDim.x,
+          kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
+          m_scratch_flags);
+
+    if (do_final_reduction) {
+      // This is the final block with the final result at the final threads'
+      // location
+
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
+              : (m_unified_space ? m_unified_space : m_scratch_space);
+
+      if (threadIdx.y == 0) {
+        final_reducer.final(reinterpret_cast<value_type*>(shared));
+      }
+
+      if (CudaTraits::WarpSize < word_count.value) {
+        __syncthreads();
+      }
+
+      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
+        global[i] = shared[i];
+      }
+    }
+  }
+
+  // Determine block size constrained by shared memory:
+  inline unsigned local_block_size(const FunctorType& f) {
+    unsigned n = CudaTraits::WarpSize * 8;
+    int shmem_size =
+        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
+            f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type,
+                           LaunchBounds>::get_cuda_func_attributes();
+    while (
+        (n &&
+         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+          shmem_size)) ||
+        (n >
+         static_cast<unsigned>(
+             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
+      n >>= 1;
+      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                             WorkTag>(f, n);
+    }
+    return n;
+  }
+
+  inline void execute() {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    const index_type nwork     = m_policy.end() - m_policy.begin();
+    const bool need_device_set = Analysis::has_init_member_function ||
+                                 Analysis::has_final_member_function ||
+                                 !m_result_ptr_host_accessible ||
+#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
+                                 Policy::is_graph_kernel::value ||
+#endif
+                                 !std::is_same<ReducerType, InvalidType>::value;
+    if ((nwork > 0) || need_device_set) {
+      const int block_size = local_block_size(m_functor);
+
+      KOKKOS_ASSERT(block_size > 0);
+
+      // TODO: down casting these uses more space than required?
+      m_scratch_space = (word_size_type*)cuda_internal_scratch_space(
+          m_policy.space(), Analysis::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)) *
+                                block_size /* block_size == max block_count */);
+
+      // Intentionally do not downcast to word_size_type since we use Cuda
+      // atomics in Kokkos_Cuda_ReduceScan.hpp
+      m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(),
+                                                    sizeof(Cuda::size_type));
+      m_unified_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+              m_policy.space(), Analysis::value_size(ReducerConditional::select(
+                                    m_functor, m_reducer))));
+
+      // REQUIRED ( 1 , N , 1 )
+      dim3 block(1, block_size, 1);
+      // Required grid.x <= block.y
+      dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1,
+                1);
+
+      // TODO @graph We need to effectively insert this in to the graph
+      const int shmem =
+          UseShflReduction
+              ? 0
+              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                          WorkTag>(m_functor,
+                                                                   block.y);
+
+      if ((nwork == 0)
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+          || Kokkos::Impl::CudaInternal::cuda_use_serial_execution()
+#endif
+      ) {
+        block = dim3(1, 1, 1);
+        grid  = dim3(1, 1, 1);
+      }
+
+      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
+          *this, grid, block, shmem,
+          m_policy.space().impl_internal_space_instance(),
+          false);  // copy to device and execute
+
+      if (!m_result_ptr_device_accessible) {
+        if (m_result_ptr) {
+          if (m_unified_space) {
+            m_policy.space().fence(
+                "Kokkos::Impl::ParallelReduce<Cuda, RangePolicy>::execute: "
+                "Result "
+                "Not Device Accessible");
+            const int count = Analysis::value_count(
+                ReducerConditional::select(m_functor, m_reducer));
+            for (int i = 0; i < count; ++i) {
+              m_result_ptr[i] = pointer_type(m_unified_space)[i];
+            }
+          } else {
+            const int size = Analysis::value_size(
+                ReducerConditional::select(m_functor, m_reducer));
+            DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), m_result_ptr,
+                                                 m_scratch_space, size);
+          }
+        }
+      }
+    } else {
+      if (m_result_ptr) {
+        // TODO @graph We need to effectively insert this in to the graph
+        final_reducer.init(m_result_ptr);
+      }
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(
+      const FunctorType& arg_functor, const Policy& arg_policy,
+      const ViewType& arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_host_accessible(
+            MemorySpaceAccess<Kokkos::HostSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {
+    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
+  }
+
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_host_accessible(
+            MemorySpaceAccess<Kokkos::HostSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {
+    check_reduced_view_shmem_size<WorkTag>(m_policy, m_functor);
+  }
+};
+
+template <class FunctorType, class... Traits>
+class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ private:
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using WorkRange    = typename Policy::WorkRange;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
+                                                 Policy, FunctorType>;
+
+ public:
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Cuda::size_type;
+
+ private:
+  // Algorithmic constraints:
+  //  (a) blockDim.y is a power of two
+  //  (b) blockDim.y == blockDim.z == 1
+  //  (c) gridDim.x  <= blockDim.y * blockDim.y
+  //  (d) gridDim.y  == gridDim.z == 1
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  size_type* m_scratch_space;
+  size_type* m_scratch_flags;
+  size_type m_final;
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+  bool m_run_serial;
+#endif
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update, const bool final_result) const {
+    m_functor(i, update, final_result);
+  }
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update, const bool final_result) const {
+    m_functor(TagType(), i, update, final_result);
+  }
+
+  //----------------------------------------
+
+  __device__ inline void initial() const {
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+
+    size_type* const shared_value =
+        kokkos_impl_cuda_shared_memory<size_type>() +
+        word_count.value * threadIdx.y;
+
+    final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
+
+    // Number of blocks is bounded so that the reduction can be limited to two
+    // passes. Each thread block is given an approximately equal amount of work
+    // to perform. Accumulate the values for this block. The accumulation
+    // ordering does not match the final pass, but is arithmatically equivalent.
+
+    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+    for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
+         iwork < iwork_end; iwork += blockDim.y) {
+      this->template exec_range<WorkTag>(
+          iwork,
+          final_reducer.reference(reinterpret_cast<pointer_type>(shared_value)),
+          false);
+    }
+
+    // Reduce and scan, writing out scan of blocks' totals and block-groups'
+    // totals. Blocks' scan values are written to 'blockIdx.x' location.
+    // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i <
+    // gridDim.x
+    cuda_single_inter_block_reduce_scan<true>(
+        final_reducer, blockIdx.x, gridDim.x,
+        kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+        m_scratch_flags);
+  }
+
+  //----------------------------------------
+
+  __device__ inline void final() const {
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+
+    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
+    // value[2] , ... }
+    size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
+    size_type* const shared_prefix =
+        shared_data + word_count.value * threadIdx.y;
+    size_type* const shared_accum =
+        shared_data + word_count.value * (blockDim.y + 1);
+
+    // Starting value for this thread block is the previous block's total.
+    if (blockIdx.x) {
+      size_type* const block_total =
+          m_scratch_space + word_count.value * (blockIdx.x - 1);
+      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
+        shared_accum[i] = block_total[i];
+      }
+    } else if (0 == threadIdx.y) {
+      final_reducer.init(reinterpret_cast<pointer_type>(shared_accum));
+    }
+
+    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+    for (typename Policy::member_type iwork_base = range.begin();
+         iwork_base < range.end(); iwork_base += blockDim.y) {
+      unsigned MASK                            = __activemask();
+      const typename Policy::member_type iwork = iwork_base + threadIdx.y;
+
+      __syncthreads();  // Don't overwrite previous iteration values until they
+                        // are used
+
+      final_reducer.init(
+          reinterpret_cast<pointer_type>(shared_prefix + word_count.value));
+
+      // Copy previous block's accumulation total into thread[0] prefix and
+      // inclusive scan value of this block
+      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
+        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
+      }
+      __syncwarp(MASK);
+      if (CudaTraits::WarpSize < word_count.value) {
+        __syncthreads();
+      }  // Protect against large scan values.
+
+      // Call functor to accumulate inclusive scan value for this work item
+      if (iwork < range.end()) {
+        this->template exec_range<WorkTag>(
+            iwork,
+            final_reducer.reference(reinterpret_cast<pointer_type>(
+                shared_prefix + word_count.value)),
+            false);
+      }
+
+      // Scan block values into locations shared_data[1..blockDim.y]
+      cuda_intra_block_reduce_scan<true>(
+          final_reducer,
+          typename Analysis::pointer_type(shared_data + word_count.value));
+
+      {
+        size_type* const block_total =
+            shared_data + word_count.value * blockDim.y;
+        for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
+          shared_accum[i] = block_total[i];
+        }
+      }
+
+      // Call functor with exclusive scan value
+      if (iwork < range.end()) {
+        this->template exec_range<WorkTag>(
+            iwork,
+            final_reducer.reference(
+                reinterpret_cast<pointer_type>(shared_prefix)),
+            true);
+      }
+    }
+  }
+
+ public:
+  Policy const& get_policy() const { return m_policy; }
+
+  //----------------------------------------
+
+  __device__ inline void operator()() const {
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    if (m_run_serial) {
+      typename Analysis::value_type value;
+      ValueInit::init(m_functor, (void*)&value);
+      const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+      for (typename Policy::member_type iwork_base = range.begin();
+           iwork_base < range.end(); iwork_base++) {
+        this->template exec_range<WorkTag>(iwork_base, value, true);
+      }
+    } else {
+#endif
+      if (!m_final) {
+        initial();
+      } else {
+        final();
+      }
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    }
+#endif
+  }
+
+  // Determine block size constrained by shared memory:
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512
+    // (16 warps) gridDim.x <= blockDim.y * blockDim.y
+    //
+    // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
+    // testing
+
+    unsigned n = CudaTraits::WarpSize * 4;
+    while (n && unsigned(m_policy.space()
+                             .impl_internal_space_instance()
+                             ->m_maxShmemPerBlock) <
+                    cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
+                                                              WorkTag>(f, n)) {
+      n >>= 1;
+    }
+    return n;
+  }
+
+  inline void execute() {
+    const auto nwork = m_policy.end() - m_policy.begin();
+    if (nwork) {
+      constexpr int GridMaxComputeCapability_2x = 0x0ffff;
+
+      const int block_size = local_block_size(m_functor);
+      KOKKOS_ASSERT(block_size > 0);
+
+      const int grid_max =
+          (block_size * block_size) < GridMaxComputeCapability_2x
+              ? (block_size * block_size)
+              : GridMaxComputeCapability_2x;
+
+      // At most 'max_grid' blocks:
+      const int max_grid =
+          std::min(int(grid_max), int((nwork + block_size - 1) / block_size));
+
+      // How much work per block:
+      const int work_per_block = (nwork + max_grid - 1) / max_grid;
+
+      // How many block are really needed for this much work:
+      const int grid_x = (nwork + work_per_block - 1) / work_per_block;
+
+      m_scratch_space = cuda_internal_scratch_space(
+          m_policy.space(), Analysis::value_size(m_functor) * grid_x);
+      m_scratch_flags =
+          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
+
+      dim3 grid(grid_x, 1, 1);
+      dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
+      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
+
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+      if (m_run_serial) {
+        block = dim3(1, 1, 1);
+        grid  = dim3(1, 1, 1);
+      } else {
+#endif
+        m_final = false;
+        CudaParallelLaunch<ParallelScan, LaunchBounds>(
+            *this, grid, block, shmem,
+            m_policy.space().impl_internal_space_instance(),
+            false);  // copy to device and execute
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+      }
+#endif
+      m_final = true;
+      CudaParallelLaunch<ParallelScan, LaunchBounds>(
+          *this, grid, block, shmem,
+          m_policy.space().impl_internal_space_instance(),
+          false);  // copy to device and execute
+    }
+  }
+
+  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_final(false)
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+        ,
+        m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution())
+#endif
+  {
+  }
+};
+
+//----------------------------------------------------------------------------
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ private:
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using WorkRange    = typename Policy::WorkRange;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
+                                                 Policy, FunctorType>;
+
+ public:
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Cuda::size_type;
+
+ private:
+  // Algorithmic constraints:
+  //  (a) blockDim.y is a power of two
+  //  (b) blockDim.y == blockDim.z == 1
+  //  (c) gridDim.x  <= blockDim.y * blockDim.y
+  //  (d) gridDim.y  == gridDim.z == 1
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  size_type* m_scratch_space;
+  size_type* m_scratch_flags;
+  size_type m_final;
+  ReturnType& m_returnvalue;
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+  bool m_run_serial;
+#endif
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update, const bool final_result) const {
+    m_functor(i, update, final_result);
+  }
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update, const bool final_result) const {
+    m_functor(TagType(), i, update, final_result);
+  }
+
+  //----------------------------------------
+
+  __device__ inline void initial() const {
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+
+    size_type* const shared_value =
+        kokkos_impl_cuda_shared_memory<size_type>() +
+        word_count.value * threadIdx.y;
+
+    final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
+
+    // Number of blocks is bounded so that the reduction can be limited to two
+    // passes. Each thread block is given an approximately equal amount of work
+    // to perform. Accumulate the values for this block. The accumulation
+    // ordering does not match the final pass, but is arithmatically equivalent.
+
+    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+    for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
+         iwork < iwork_end; iwork += blockDim.y) {
+      this->template exec_range<WorkTag>(
+          iwork,
+          final_reducer.reference(reinterpret_cast<pointer_type>(shared_value)),
+          false);
+    }
+
+    // Reduce and scan, writing out scan of blocks' totals and block-groups'
+    // totals. Blocks' scan values are written to 'blockIdx.x' location.
+    // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i <
+    // gridDim.x
+    cuda_single_inter_block_reduce_scan<true>(
+        final_reducer, blockIdx.x, gridDim.x,
+        kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+        m_scratch_flags);
+  }
+
+  //----------------------------------------
+
+  __device__ inline void final() const {
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+
+    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
+    // value[2] , ... }
+    size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
+    size_type* const shared_prefix =
+        shared_data + word_count.value * threadIdx.y;
+    size_type* const shared_accum =
+        shared_data + word_count.value * (blockDim.y + 1);
+
+    // Starting value for this thread block is the previous block's total.
+    if (blockIdx.x) {
+      size_type* const block_total =
+          m_scratch_space + word_count.value * (blockIdx.x - 1);
+      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
+        shared_accum[i] = block_total[i];
+      }
+    } else if (0 == threadIdx.y) {
+      final_reducer.init(reinterpret_cast<pointer_type>(shared_accum));
+    }
+
+    const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+    for (typename Policy::member_type iwork_base = range.begin();
+         iwork_base < range.end(); iwork_base += blockDim.y) {
+      unsigned MASK = __activemask();
+
+      const typename Policy::member_type iwork = iwork_base + threadIdx.y;
+
+      __syncthreads();  // Don't overwrite previous iteration values until they
+                        // are used
+
+      final_reducer.init(
+          reinterpret_cast<pointer_type>(shared_prefix + word_count.value));
+
+      // Copy previous block's accumulation total into thread[0] prefix and
+      // inclusive scan value of this block
+      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
+        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
+      }
+
+      __syncwarp(MASK);
+      if (CudaTraits::WarpSize < word_count.value) {
+        __syncthreads();
+      }  // Protect against large scan values.
+
+      // Call functor to accumulate inclusive scan value for this work item
+      if (iwork < range.end()) {
+        this->template exec_range<WorkTag>(
+            iwork,
+            final_reducer.reference(reinterpret_cast<pointer_type>(
+                shared_prefix + word_count.value)),
+            false);
+      }
+
+      // Scan block values into locations shared_data[1..blockDim.y]
+      cuda_intra_block_reduce_scan<true>(
+          final_reducer,
+          typename Analysis::pointer_type(shared_data + word_count.value));
+
+      {
+        size_type* const block_total =
+            shared_data + word_count.value * blockDim.y;
+        for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
+          shared_accum[i] = block_total[i];
+        }
+      }
+
+      // Call functor with exclusive scan value
+      if (iwork < range.end()) {
+        this->template exec_range<WorkTag>(
+            iwork,
+            final_reducer.reference(
+                reinterpret_cast<pointer_type>(shared_prefix)),
+            true);
+      }
+    }
+  }
+
+ public:
+  Policy const& get_policy() const { return m_policy; }
+
+  //----------------------------------------
+
+  __device__ inline void operator()() const {
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    if (m_run_serial) {
+      typename Analysis::value_type value;
+      ValueInit::init(m_functor, (void*)&value);
+      const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+      for (typename Policy::member_type iwork_base = range.begin();
+           iwork_base < range.end(); iwork_base++) {
+        this->template exec_range<WorkTag>(iwork_base, value, true);
+      }
+      *((typename Analysis::value_type*)m_scratch_space) = value;
+    } else {
+#endif
+      if (!m_final) {
+        initial();
+      } else {
+        final();
+      }
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    }
+#endif
+  }
+
+  // Determine block size constrained by shared memory:
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512
+    // (16 warps) gridDim.x <= blockDim.y * blockDim.y
+    //
+    // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
+    // testing
+
+    unsigned n = CudaTraits::WarpSize * 4;
+    while (n && unsigned(m_policy.space()
+                             .impl_internal_space_instance()
+                             ->m_maxShmemPerBlock) <
+                    cuda_single_inter_block_reduce_scan_shmem<true, FunctorType,
+                                                              WorkTag>(f, n)) {
+      n >>= 1;
+    }
+    return n;
+  }
+
+  inline void execute() {
+    const auto nwork = m_policy.end() - m_policy.begin();
+    if (nwork) {
+      enum { GridMaxComputeCapability_2x = 0x0ffff };
+
+      const int block_size = local_block_size(m_functor);
+      KOKKOS_ASSERT(block_size > 0);
+
+      const int grid_max =
+          (block_size * block_size) < GridMaxComputeCapability_2x
+              ? (block_size * block_size)
+              : GridMaxComputeCapability_2x;
+
+      // At most 'max_grid' blocks:
+      const int max_grid =
+          std::min(int(grid_max), int((nwork + block_size - 1) / block_size));
+
+      // How much work per block:
+      const int work_per_block = (nwork + max_grid - 1) / max_grid;
+
+      // How many block are really needed for this much work:
+      const int grid_x = (nwork + work_per_block - 1) / work_per_block;
+
+      m_scratch_space = cuda_internal_scratch_space(
+          m_policy.space(), Analysis::value_size(m_functor) * grid_x);
+      m_scratch_flags =
+          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
+
+      dim3 grid(grid_x, 1, 1);
+      dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
+      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
+
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+      if (m_run_serial) {
+        block = dim3(1, 1, 1);
+        grid  = dim3(1, 1, 1);
+      } else {
+#endif
+
+        m_final = false;
+        CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
+            *this, grid, block, shmem,
+            m_policy.space().impl_internal_space_instance(),
+            false);  // copy to device and execute
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+      }
+#endif
+      m_final = true;
+      CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
+          *this, grid, block, shmem,
+          m_policy.space().impl_internal_space_instance(),
+          false);  // copy to device and execute
+
+      const int size = Analysis::value_size(m_functor);
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+      if (m_run_serial)
+        DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), &m_returnvalue,
+                                             m_scratch_space, size);
+      else
+#endif
+        DeepCopy<HostSpace, CudaSpace, Cuda>(
+            m_policy.space(), &m_returnvalue,
+            m_scratch_space + (grid_x - 1) * size / sizeof(int), size);
+    }
+  }
+
+  ParallelScanWithTotal(const FunctorType& arg_functor,
+                        const Policy& arg_policy, ReturnType& arg_returnvalue)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_final(false),
+        m_returnvalue(arg_returnvalue)
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+        ,
+        m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution())
+#endif
+  {
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
new file mode 100644
index 000000000..cdd16085b
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -0,0 +1,1139 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_TEAM_HPP
+#define KOKKOS_CUDA_PARALLEL_TEAM_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_CUDA)
+
+#include <algorithm>
+#include <string>
+#include <cstdio>
+#include <cstdint>
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_Team.hpp>
+#include <Kokkos_MinMaxClamp.hpp>
+#include <Kokkos_Vectorization.hpp>
+
+#include <impl/Kokkos_Tools.hpp>
+#include <typeinfo>
+
+#include <impl/KokkosExp_IterateTileGPU.hpp>
+
+namespace Kokkos {
+
+extern bool show_warnings() noexcept;
+
+namespace Impl {
+
+template <class... Properties>
+class TeamPolicyInternal<Kokkos::Cuda, Properties...>
+    : public PolicyTraits<Properties...> {
+ public:
+  //! Tag this class as a kokkos execution policy
+  using execution_policy = TeamPolicyInternal;
+
+  using traits = PolicyTraits<Properties...>;
+
+  template <class ExecSpace, class... OtherProperties>
+  friend class TeamPolicyInternal;
+
+ private:
+  static constexpr int MAX_WARP = 8;
+
+  typename traits::execution_space m_space;
+  int m_league_size;
+  int m_team_size;
+  int m_vector_length;
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+  int m_chunk_size;
+  bool m_tune_team;
+  bool m_tune_vector;
+
+ public:
+  //! Execution space of this execution policy
+  using execution_space = Kokkos::Cuda;
+
+  template <class... OtherProperties>
+  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) {
+    m_league_size            = p.m_league_size;
+    m_team_size              = p.m_team_size;
+    m_vector_length          = p.m_vector_length;
+    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
+    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size             = p.m_chunk_size;
+    m_space                  = p.m_space;
+    m_tune_team              = p.m_tune_team;
+    m_tune_vector            = p.m_tune_vector;
+  }
+
+  //----------------------------------------
+
+  template <class FunctorType>
+  int team_size_max(const FunctorType& f, const ParallelForTag&) const {
+    using closure_type =
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
+            get_cuda_func_attributes();
+    int block_size =
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
+                                              typename traits::launch_bounds>(
+            space().impl_internal_space_instance(), attr, f,
+            (size_t)impl_vector_length(),
+            (size_t)team_scratch_size(0) + 2 * sizeof(double),
+            (size_t)thread_scratch_size(0) + sizeof(double));
+    return block_size / impl_vector_length();
+  }
+
+  template <class FunctorType>
+  inline int team_size_max(const FunctorType& f,
+                           const ParallelReduceTag&) const {
+    using functor_analysis_type =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                              TeamPolicyInternal, FunctorType>;
+    using reducer_type = typename Impl::ParallelReduceReturnValue<
+        void, typename functor_analysis_type::value_type,
+        FunctorType>::reducer_type;
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             reducer_type>;
+    return internal_team_size_max<closure_type>(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
+                           const ParallelReduceTag&) const {
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             ReducerType>;
+    return internal_team_size_max<closure_type>(f);
+  }
+
+  template <class FunctorType>
+  int team_size_recommended(const FunctorType& f, const ParallelForTag&) const {
+    using closure_type =
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
+            get_cuda_func_attributes();
+    const int block_size =
+        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
+                                              typename traits::launch_bounds>(
+            space().impl_internal_space_instance(), attr, f,
+            (size_t)impl_vector_length(),
+            (size_t)team_scratch_size(0) + 2 * sizeof(double),
+            (size_t)thread_scratch_size(0) + sizeof(double));
+    return block_size / impl_vector_length();
+  }
+
+  template <class FunctorType>
+  inline int team_size_recommended(const FunctorType& f,
+                                   const ParallelReduceTag&) const {
+    using functor_analysis_type =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                              TeamPolicyInternal, FunctorType>;
+    using reducer_type = typename Impl::ParallelReduceReturnValue<
+        void, typename functor_analysis_type::value_type,
+        FunctorType>::reducer_type;
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             reducer_type>;
+    return internal_team_size_recommended<closure_type>(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  int team_size_recommended(const FunctorType& f, const ReducerType&,
+                            const ParallelReduceTag&) const {
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             ReducerType>;
+    return internal_team_size_recommended<closure_type>(f);
+  }
+
+  inline static int vector_length_max() { return Impl::CudaTraits::WarpSize; }
+
+  inline static int verify_requested_vector_length(
+      int requested_vector_length) {
+    int test_vector_length =
+        std::min(requested_vector_length, vector_length_max());
+
+    // Allow only power-of-two vector_length
+    if (!(is_integral_power_of_two(test_vector_length))) {
+      int test_pow2 = 1;
+      for (int i = 0; i < 5; i++) {
+        test_pow2 = test_pow2 << 1;
+        if (test_pow2 > test_vector_length) {
+          break;
+        }
+      }
+      test_vector_length = test_pow2 >> 1;
+    }
+
+    return test_vector_length;
+  }
+
+  inline static int scratch_size_max(int level) {
+    return (
+        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
+                                  // for team_member.reduce etc.
+            20 * 1024 *
+                1024);  // arbitrarily setting this to 20MB, for a Volta V100
+                        // that would give us about 3.2GB for 2 teams per SM
+  }
+
+  //----------------------------------------
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  KOKKOS_DEPRECATED inline int vector_length() const {
+    return impl_vector_length();
+  }
+#endif
+  inline int impl_vector_length() const { return m_vector_length; }
+  inline int team_size() const { return m_team_size; }
+  inline int league_size() const { return m_league_size; }
+  inline bool impl_auto_team_size() const { return m_tune_team; }
+  inline bool impl_auto_vector_length() const { return m_tune_vector; }
+  inline void impl_set_team_size(size_t team_size) { m_team_size = team_size; }
+  inline void impl_set_vector_length(size_t vector_length) {
+    m_vector_length = vector_length;
+  }
+  size_t scratch_size(int level, int team_size_ = -1) const {
+    if (team_size_ < 0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] +
+           team_size_ * m_thread_scratch_size[level];
+  }
+  size_t team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
+  size_t thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
+
+  const typename traits::execution_space& space() const { return m_space; }
+
+  TeamPolicyInternal()
+      : m_space(typename traits::execution_space()),
+        m_league_size(0),
+        m_team_size(-1),
+        m_vector_length(0),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(Impl::CudaTraits::WarpSize),
+        m_tune_team(false),
+        m_tune_vector(false) {}
+
+  /** \brief  Specify league size, specify team size, specify vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request, int vector_length_request = 1)
+      : m_space(space_),
+        m_league_size(league_size_),
+        m_team_size(team_size_request),
+        m_vector_length(
+            (vector_length_request > 0)
+                ? verify_requested_vector_length(vector_length_request)
+                : verify_requested_vector_length(1)),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(Impl::CudaTraits::WarpSize),
+        m_tune_team(bool(team_size_request <= 0)),
+        m_tune_vector(bool(vector_length_request <= 0)) {
+    // Make sure league size is permissible
+    if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0]))
+      Impl::throw_runtime_exception(
+          "Requested too large league_size for TeamPolicy on Cuda execution "
+          "space.");
+
+    // Make sure total block size is permissible
+    if (m_team_size * m_vector_length >
+        int(Impl::CudaTraits::MaxHierarchicalParallelism)) {
+      Impl::throw_runtime_exception(
+          std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
+                      "Team size x vector length must be smaller than 1024."));
+    }
+  }
+
+  /** \brief  Specify league size, request team size, specify vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */
+                     ,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
+
+  /** \brief  Specify league size, request team size and vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+                     )
+      : TeamPolicyInternal(space_, league_size_, -1, -1) {}
+
+  /** \brief  Specify league size, specify team size, request vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request, const Kokkos::AUTO_t&)
+      : TeamPolicyInternal(space_, league_size_, team_size_request, -1) {}
+
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request)
+
+  {}
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
+                     const Kokkos::AUTO_t& vector_length_request)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     const Kokkos::AUTO_t& vector_length_request)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  inline int chunk_size() const { return m_chunk_size; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal& set_chunk_size(
+      typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(const int& level,
+                                              const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  }
+
+  /** \brief set per thread scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  /** \brief set per thread and per team scratch size for a specific level of
+   * the scratch hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerTeamValue& per_team,
+      const PerThreadValue& per_thread) {
+    m_team_scratch_size[level]   = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  using member_type = Kokkos::Impl::CudaTeamMember;
+
+ protected:
+  template <class ClosureType, class FunctorType, class BlockSizeCallable>
+  int internal_team_size_common(const FunctorType& f,
+                                BlockSizeCallable&& block_size_callable) const {
+    using closure_type = ClosureType;
+    using Interface =
+        typename Impl::DeduceFunctorPatternInterface<ClosureType>::type;
+    using Analysis =
+        Impl::FunctorAnalysis<Interface, typename ClosureType::Policy,
+                              FunctorType>;
+
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
+            get_cuda_func_attributes();
+    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
+        space().impl_internal_space_instance(), attr, f,
+        (size_t)impl_vector_length(),
+        (size_t)team_scratch_size(0) + 2 * sizeof(double),
+        (size_t)thread_scratch_size(0) + sizeof(double) +
+            ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)));
+    KOKKOS_ASSERT(block_size > 0);
+
+    // Currently we require Power-of-2 team size for reductions.
+    int p2 = 1;
+    while (p2 <= block_size) p2 *= 2;
+    p2 /= 2;
+    return p2 / impl_vector_length();
+  }
+
+  template <class ClosureType, class FunctorType>
+  int internal_team_size_max(const FunctorType& f) const {
+    return internal_team_size_common<ClosureType>(
+        f,
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
+                                              typename traits::launch_bounds>);
+  }
+
+  template <class ClosureType, class FunctorType>
+  int internal_team_size_recommended(const FunctorType& f) const {
+    return internal_team_size_common<ClosureType>(
+        f,
+        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
+                                              typename traits::launch_bounds>);
+  }
+};
+
+__device__ inline int64_t cuda_get_scratch_index(Cuda::size_type league_size,
+                                                 int32_t* scratch_locks) {
+  int64_t threadid = 0;
+  __shared__ int64_t base_thread_id;
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+    int64_t const wraparound_len = Kokkos::max(
+        int64_t(1), Kokkos::min(int64_t(league_size),
+                                (int64_t(g_device_cuda_lock_arrays.n)) /
+                                    (blockDim.x * blockDim.y)));
+    threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
+    threadid *= blockDim.x * blockDim.y;
+    int done = 0;
+    while (!done) {
+      done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1));
+      if (!done) {
+        threadid += blockDim.x * blockDim.y;
+        if (int64_t(threadid + blockDim.x * blockDim.y) >=
+            wraparound_len * blockDim.x * blockDim.y)
+          threadid = 0;
+      }
+    }
+    base_thread_id = threadid;
+  }
+  __syncthreads();
+  threadid = base_thread_id;
+  return threadid;
+}
+
+__device__ inline void cuda_release_scratch_index(int32_t* scratch_locks,
+                                                  int64_t threadid) {
+  __syncthreads();
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+    scratch_locks[threadid] = 0;
+  }
+}
+
+template <class FunctorType, class... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Cuda> {
+ public:
+  using Policy = TeamPolicy<Properties...>;
+
+ private:
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+ public:
+  using functor_type = FunctorType;
+  using size_type    = Cuda::size_type;
+
+ private:
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
+  // blockDim.z == 1 shared memory utilization:
+  //
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const size_type m_league_size;
+  int m_team_size;
+  const size_type m_vector_size;
+  int m_shmem_begin;
+  int m_shmem_size;
+  void* m_scratch_ptr[2];
+  size_t m_scratch_size[2];
+  int m_scratch_pool_id = -1;
+  int32_t* m_scratch_locks;
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
+      const Member& member) const {
+    m_functor(member);
+  }
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team(
+      const Member& member) const {
+    m_functor(TagType(), member);
+  }
+
+ public:
+  Policy const& get_policy() const { return m_policy; }
+
+  __device__ inline void operator()() const {
+    // Iterate this block through the league
+    int64_t threadid = 0;
+    if (m_scratch_size[1] > 0) {
+      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+    }
+
+    const int int_league_size = (int)m_league_size;
+    for (int league_rank = blockIdx.x; league_rank < int_league_size;
+         league_rank += gridDim.x) {
+      this->template exec_team<WorkTag>(typename Policy::member_type(
+          kokkos_impl_cuda_shared_memory<void>(), m_shmem_begin, m_shmem_size,
+          (void*)(((char*)m_scratch_ptr[1]) +
+                  ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
+                      m_scratch_size[1]),
+          m_scratch_size[1], league_rank, m_league_size));
+    }
+    if (m_scratch_size[1] > 0) {
+      cuda_release_scratch_index(m_scratch_locks, threadid);
+    }
+  }
+
+  inline void execute() const {
+    const int64_t shmem_size_total = m_shmem_begin + m_shmem_size;
+    dim3 grid(int(m_league_size), 1, 1);
+    const dim3 block(int(m_vector_size), int(m_team_size), 1);
+
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
+      grid = dim3(1, 1, 1);
+    }
+#endif
+
+    CudaParallelLaunch<ParallelFor, LaunchBounds>(
+        *this, grid, block, shmem_size_total,
+        m_policy.space().impl_internal_space_instance(),
+        true);  // copy to device and execute
+  }
+
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelFor,
+                           LaunchBounds>::get_cuda_func_attributes();
+    m_team_size =
+        m_team_size >= 0
+            ? m_team_size
+            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
+                  m_policy.space().impl_internal_space_instance(), attr,
+                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
+                  m_policy.thread_scratch_size(0)) /
+                  m_vector_size;
+
+    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
+    m_shmem_size =
+        (m_policy.scratch_size(0, m_team_size) +
+         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
+    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+    m_scratch_locks =
+        m_policy.space().impl_internal_space_instance()->m_scratch_locks;
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+    m_scratch_ptr[0] = nullptr;
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (std::min(
+                      static_cast<std::int64_t>(Cuda::concurrency() /
+                                                (m_team_size * m_vector_size)),
+                      static_cast<std::int64_t>(m_league_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
+
+    const int shmem_size_total = m_shmem_begin + m_shmem_size;
+    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+        shmem_size_total) {
+      printf(
+          "%i %i\n",
+          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
+          shmem_size_total);
+      Kokkos::Impl::throw_runtime_exception(std::string(
+          "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
+    }
+
+    if (int(m_team_size) >
+        int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                m_policy.space().impl_internal_space_instance(), attr,
+                arg_functor, arg_policy.impl_vector_length(),
+                arg_policy.team_scratch_size(0),
+                arg_policy.thread_scratch_size(0)) /
+            arg_policy.impl_vector_length())) {
+      Kokkos::Impl::throw_runtime_exception(std::string(
+          "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
+    }
+  }
+
+  ~ParallelFor() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
+};
+
+template <class FunctorType, class ReducerType, class... Properties>
+class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                     ReducerType, Kokkos::Cuda> {
+ public:
+  using Policy = TeamPolicy<Properties...>;
+
+ private:
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
+                                    ReducerTypeFwd>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+  using value_type     = typename Analysis::value_type;
+
+ public:
+  using functor_type = FunctorType;
+  using size_type    = Cuda::size_type;
+  using reducer_type = ReducerType;
+
+  static constexpr bool UseShflReduction =
+      (true && (Analysis::StaticValueSize != 0));
+
+ private:
+  struct ShflReductionTag {};
+  struct SHMEMReductionTag {};
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
+  // blockDim.z == 1 shared memory utilization:
+  //
+  //  [ global reduce space ]
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+  const bool m_result_ptr_host_accessible;
+  size_type* m_scratch_space;
+  size_type* m_scratch_flags;
+  size_type* m_unified_space;
+  size_type m_team_begin;
+  size_type m_shmem_begin;
+  size_type m_shmem_size;
+  void* m_scratch_ptr[2];
+  size_t m_scratch_size[2];
+  int m_scratch_pool_id = -1;
+  int32_t* m_scratch_locks;
+  const size_type m_league_size;
+  int m_team_size;
+  const size_type m_vector_size;
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
+      const Member& member, reference_type update) const {
+    m_functor(member, update);
+  }
+
+  template <class TagType>
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team(
+      const Member& member, reference_type update) const {
+    m_functor(TagType(), member, update);
+  }
+
+ public:
+  Policy const& get_policy() const { return m_policy; }
+
+  __device__ inline void operator()() const {
+    int64_t threadid = 0;
+    if (m_scratch_size[1] > 0) {
+      threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+    }
+
+    using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
+                                            SHMEMReductionTag>;
+    run(ReductionTag{}, threadid);
+    if (m_scratch_size[1] > 0) {
+      cuda_release_scratch_index(m_scratch_locks, threadid);
+    }
+  }
+
+  __device__ inline void run(SHMEMReductionTag&, const int& threadid) const {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(Analysis::value_size(
+                       ReducerConditional::select(m_functor, m_reducer)) /
+                   sizeof(size_type));
+
+    reference_type value =
+        final_reducer.init(kokkos_impl_cuda_shared_memory<size_type>() +
+                           threadIdx.y * word_count.value);
+
+    // Iterate this block through the league
+    const int int_league_size = (int)m_league_size;
+    for (int league_rank = blockIdx.x; league_rank < int_league_size;
+         league_rank += gridDim.x) {
+      this->template exec_team<WorkTag>(
+          Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin,
+                 m_shmem_begin, m_shmem_size,
+                 (void*)(((char*)m_scratch_ptr[1]) +
+                         ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
+                             m_scratch_size[1]),
+                 m_scratch_size[1], league_rank, m_league_size),
+          value);
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    bool zero_length        = m_league_size == 0;
+    bool do_final_reduction = true;
+    if (!zero_length)
+      do_final_reduction = cuda_single_inter_block_reduce_scan<false>(
+          final_reducer, blockIdx.x, gridDim.x,
+          kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+          m_scratch_flags);
+
+    if (do_final_reduction) {
+      // This is the final block with the final result at the final threads'
+      // location
+
+      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
+                                (blockDim.y - 1) * word_count.value;
+      size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<size_type*>(m_result_ptr)
+              : (m_unified_space ? m_unified_space : m_scratch_space);
+
+      if (threadIdx.y == 0) {
+        final_reducer.final(reinterpret_cast<value_type*>(shared));
+      }
+
+      if (CudaTraits::WarpSize < word_count.value) {
+        __syncthreads();
+      }
+
+      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
+        global[i] = shared[i];
+      }
+    }
+  }
+
+  __device__ inline void run(ShflReductionTag, const int& threadid) const {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    value_type value;
+    final_reducer.init(&value);
+
+    // Iterate this block through the league
+    const int int_league_size = (int)m_league_size;
+    for (int league_rank = blockIdx.x; league_rank < int_league_size;
+         league_rank += gridDim.x) {
+      this->template exec_team<WorkTag>(
+          Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin,
+                 m_shmem_begin, m_shmem_size,
+                 (void*)(((char*)m_scratch_ptr[1]) +
+                         ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
+                             m_scratch_size[1]),
+                 m_scratch_size[1], league_rank, m_league_size),
+          value);
+    }
+
+    pointer_type const result =
+        m_result_ptr_device_accessible
+            ? m_result_ptr
+            : (pointer_type)(m_unified_space ? m_unified_space
+                                             : m_scratch_space);
+
+    value_type init;
+    final_reducer.init(&init);
+
+    if (int_league_size == 0) {
+      final_reducer.final(&value);
+      *result = value;
+    } else if (Impl::cuda_inter_block_reduction(value, init, final_reducer,
+                                                m_scratch_space, result,
+                                                m_scratch_flags, blockDim.y)) {
+      const unsigned id = threadIdx.y * blockDim.x + threadIdx.x;
+      if (id == 0) {
+        final_reducer.final(&value);
+        *result = value;
+      }
+    }
+  }
+
+  inline void execute() {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    const bool is_empty_range  = m_league_size == 0 || m_team_size == 0;
+    const bool need_device_set = Analysis::has_init_member_function ||
+                                 Analysis::has_final_member_function ||
+                                 !m_result_ptr_host_accessible ||
+#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
+                                 Policy::is_graph_kernel::value ||
+#endif
+                                 !std::is_same<ReducerType, InvalidType>::value;
+    if (!is_empty_range || need_device_set) {
+      const int block_count = std::max(
+          1u, UseShflReduction ? std::min(m_league_size, size_type(1024 * 32))
+                               : std::min(int(m_league_size), m_team_size));
+
+      m_scratch_space = cuda_internal_scratch_space(
+          m_policy.space(), Analysis::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)) *
+                                block_count);
+      m_scratch_flags =
+          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
+      m_unified_space = cuda_internal_scratch_unified(
+          m_policy.space(), Analysis::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)));
+
+      dim3 block(m_vector_size, m_team_size, 1);
+      dim3 grid(block_count, 1, 1);
+      const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
+
+      if (is_empty_range
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+          || Kokkos::Impl::CudaInternal::cuda_use_serial_execution()
+#endif
+      ) {
+        block = dim3(1, 1, 1);
+        grid  = dim3(1, 1, 1);
+      }
+
+      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
+          *this, grid, block, shmem_size_total,
+          m_policy.space().impl_internal_space_instance(),
+          true);  // copy to device and execute
+
+      if (!m_result_ptr_device_accessible) {
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, TeamPolicy>::execute: Result "
+            "Not Device Accessible");
+
+        if (m_result_ptr) {
+          if (m_unified_space) {
+            const int count = Analysis::value_count(
+                ReducerConditional::select(m_functor, m_reducer));
+            for (int i = 0; i < count; ++i) {
+              m_result_ptr[i] = pointer_type(m_unified_space)[i];
+            }
+          } else {
+            const int size = Analysis::value_size(
+                ReducerConditional::select(m_functor, m_reducer));
+            DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
+          }
+        }
+      }
+    } else {
+      if (m_result_ptr) {
+        // TODO @graph We need to effectively insert this in to the graph
+        final_reducer.init(m_result_ptr);
+      }
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(
+      const FunctorType& arg_functor, const Policy& arg_policy,
+      const ViewType& arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_host_accessible(
+            MemorySpaceAccess<Kokkos::HostSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr),
+        m_team_begin(0),
+        m_shmem_begin(0),
+        m_shmem_size(0),
+        m_scratch_ptr{nullptr, nullptr},
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelReduce,
+                           LaunchBounds>::get_cuda_func_attributes();
+    m_team_size =
+        m_team_size >= 0
+            ? m_team_size
+            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
+                  m_policy.space().impl_internal_space_instance(), attr,
+                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
+                  m_policy.thread_scratch_size(0)) /
+                  m_vector_size;
+
+    m_team_begin =
+        UseShflReduction
+            ? 0
+            : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                        WorkTag>(arg_functor,
+                                                                 m_team_size);
+    m_shmem_begin = sizeof(double) * (m_team_size + 2);
+    m_shmem_size =
+        m_policy.scratch_size(0, m_team_size) +
+        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+    m_scratch_locks =
+        m_policy.space().impl_internal_space_instance()->m_scratch_locks;
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (std::min(
+                      static_cast<std::int64_t>(Cuda::concurrency() /
+                                                (m_team_size * m_vector_size)),
+                      static_cast<std::int64_t>(m_league_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
+
+    // The global parallel_reduce does not support vector_length other than 1 at
+    // the moment
+    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
+      Impl::throw_runtime_exception(
+          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
+          "greater than 1 is not currently supported for CUDA for dynamic "
+          "sized reduction types.");
+
+    if ((m_team_size < 32) && !UseShflReduction)
+      Impl::throw_runtime_exception(
+          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
+          "than 32 is not currently supported with CUDA for dynamic sized "
+          "reduction types.");
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
+
+    if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
+        !UseShflReduction) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+        shmem_size_total) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much "
+                      "L0 scratch memory"));
+    }
+
+    if (int(m_team_size) >
+        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
+                      "large team size."));
+    }
+  }
+
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_host_accessible(
+            MemorySpaceAccess<Kokkos::HostSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr),
+        m_team_begin(0),
+        m_shmem_begin(0),
+        m_shmem_size(0),
+        m_scratch_ptr{nullptr, nullptr},
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelReduce,
+                           LaunchBounds>::get_cuda_func_attributes();
+
+    // Valid team size not provided, deduce team size
+    m_team_size =
+        m_team_size >= 0
+            ? m_team_size
+            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
+                  m_policy.space().impl_internal_space_instance(), attr,
+                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
+                  m_policy.thread_scratch_size(0)) /
+                  m_vector_size;
+
+    m_team_begin =
+        UseShflReduction
+            ? 0
+            : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                        WorkTag>(arg_functor,
+                                                                 m_team_size);
+    m_shmem_begin = sizeof(double) * (m_team_size + 2);
+    m_shmem_size =
+        m_policy.scratch_size(0, m_team_size) +
+        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+    m_scratch_locks =
+        m_policy.space().impl_internal_space_instance()->m_scratch_locks;
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (std::min(
+                      static_cast<std::int64_t>(Cuda::concurrency() /
+                                                (m_team_size * m_vector_size)),
+                      static_cast<std::int64_t>(m_league_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
+
+    // The global parallel_reduce does not support vector_length other than 1 at
+    // the moment
+    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
+      Impl::throw_runtime_exception(
+          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
+          "greater than 1 is not currently supported for CUDA for dynamic "
+          "sized reduction types.");
+
+    if ((m_team_size < 32) && !UseShflReduction)
+      Impl::throw_runtime_exception(
+          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
+          "than 32 is not currently supported with CUDA for dynamic sized "
+          "reduction types.");
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
+
+    if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
+         !UseShflReduction) ||
+        m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+            shmem_size_total) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    size_type team_size_max =
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+            m_policy.space().impl_internal_space_instance(), attr, m_functor,
+            m_vector_size, m_policy.team_scratch_size(0),
+            m_policy.thread_scratch_size(0)) /
+        m_vector_size;
+
+    if ((int)m_team_size > (int)team_size_max) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
+                      "large team size."));
+    }
+  }
+
+  ~ParallelReduce() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+#endif /* defined(KOKKOS_ENABLE_CUDA) */
+#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 30f5221da..078315b65 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -51,7 +51,6 @@
 #include <utility>
 
 #include <Kokkos_Parallel.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Cuda/Kokkos_Cuda_Vectorization.hpp>
 
@@ -69,11 +68,10 @@ namespace Impl {
  *   (c) blockDim.z == 1
  */
 
-template <class ValueType, class JoinOp>
-__device__ inline
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    cuda_intra_warp_reduction(ValueType& result, const JoinOp& join,
-                              const uint32_t max_active_thread = blockDim.y) {
+template <class ValueType, class ReducerType>
+__device__ inline void cuda_intra_warp_reduction(
+    ValueType& result, const ReducerType& reducer,
+    const uint32_t max_active_thread = blockDim.y) {
   unsigned int shift = 1;
 
   // Reduce over values from threads with different threadIdx.y
@@ -81,18 +79,17 @@ __device__ inline
     const ValueType tmp = shfl_down(result, blockDim.x * shift, 32u);
     // Only join if upper thread is active (this allows non power of two for
     // blockDim.y
-    if (threadIdx.y + shift < max_active_thread) join(result, tmp);
+    if (threadIdx.y + shift < max_active_thread) reducer.join(&result, &tmp);
     shift *= 2;
   }
 
   result = shfl(result, 0, 32);
 }
 
-template <class ValueType, class JoinOp>
-__device__ inline
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    cuda_inter_warp_reduction(ValueType& value, const JoinOp& join,
-                              const int max_active_thread = blockDim.y) {
+template <class ValueType, class ReducerType>
+__device__ inline void cuda_inter_warp_reduction(
+    ValueType& value, const ReducerType& reducer,
+    const int max_active_thread = blockDim.y) {
 #define STEP_WIDTH 4
   // Depending on the ValueType _shared__ memory must be aligned up to 8byte
   // boundaries The reason not to use ValueType directly is that for types with
@@ -110,7 +107,7 @@ __device__ inline
   __syncthreads();
   while (shift <= max_active_thread / step) {
     if (shift <= id && shift + STEP_WIDTH > id && threadIdx.x == 0) {
-      join(result[id % STEP_WIDTH], value);
+      reducer.join(&result[id % STEP_WIDTH], &value);
     }
     __syncthreads();
     shift += STEP_WIDTH;
@@ -118,35 +115,30 @@ __device__ inline
 
   value = result[0];
   for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
-    join(value, result[i]);
+    reducer.join(&value, &result[i]);
 }
 
-template <class ValueType, class JoinOp>
-__device__ inline
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    cuda_intra_block_reduction(ValueType& value, const JoinOp& join,
-                               const int max_active_thread = blockDim.y) {
-  cuda_intra_warp_reduction(value, join, max_active_thread);
-  cuda_inter_warp_reduction(value, join, max_active_thread);
+template <class ValueType, class ReducerType>
+__device__ inline void cuda_intra_block_reduction(
+    ValueType& value, const ReducerType& reducer,
+    const int max_active_thread = blockDim.y) {
+  cuda_intra_warp_reduction(value, reducer, max_active_thread);
+  cuda_inter_warp_reduction(value, reducer, max_active_thread);
 }
 
-template <class FunctorType, class JoinOp, class ArgTag = void>
+template <class FunctorType>
 __device__ bool cuda_inter_block_reduction(
-    typename FunctorValueTraits<FunctorType, ArgTag>::reference_type value,
-    typename FunctorValueTraits<FunctorType, ArgTag>::reference_type neutral,
-    const JoinOp& join, Cuda::size_type* const m_scratch_space,
-    typename FunctorValueTraits<FunctorType,
-                                ArgTag>::pointer_type const /*result*/,
+    typename FunctorType::reference_type value,
+    typename FunctorType::reference_type neutral, const FunctorType& reducer,
+    Cuda::size_type* const m_scratch_space,
+    typename FunctorType::pointer_type const /*result*/,
     Cuda::size_type* const m_scratch_flags,
     const int max_active_thread = blockDim.y) {
-#ifdef __CUDA_ARCH__
-  using pointer_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type;
-  using value_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;
+  using pointer_type = typename FunctorType::pointer_type;
+  using value_type   = typename FunctorType::value_type;
 
   // Do the intra-block reduction with shfl operations and static shared memory
-  cuda_intra_block_reduction(value, join, max_active_thread);
+  cuda_intra_block_reduction(value, reducer, max_active_thread);
 
   const int id = threadIdx.y * blockDim.x + threadIdx.x;
 
@@ -182,240 +174,51 @@ __device__ bool cuda_inter_block_reduction(
           blockDim.x * blockDim.y < 32 ? blockDim.x * blockDim.y : 32;
       for (int i = id; i < (int)gridDim.x; i += step_size) {
         value_type tmp = global[i];
-        join(value, tmp);
-      }
-
-      // Perform shfl reductions within the warp only join if contribution is
-      // valid (allows gridDim.x non power of two and <32)
-      if (int(blockDim.x * blockDim.y) > 1) {
-        value_type tmp = Kokkos::shfl_down(value, 1, 32);
-        if (id + 1 < int(gridDim.x)) join(value, tmp);
-      }
-      unsigned int mask = __activemask();
-      __syncwarp(mask);
-      if (int(blockDim.x * blockDim.y) > 2) {
-        value_type tmp = Kokkos::shfl_down(value, 2, 32);
-        if (id + 2 < int(gridDim.x)) join(value, tmp);
-      }
-      __syncwarp(mask);
-      if (int(blockDim.x * blockDim.y) > 4) {
-        value_type tmp = Kokkos::shfl_down(value, 4, 32);
-        if (id + 4 < int(gridDim.x)) join(value, tmp);
-      }
-      __syncwarp(mask);
-      if (int(blockDim.x * blockDim.y) > 8) {
-        value_type tmp = Kokkos::shfl_down(value, 8, 32);
-        if (id + 8 < int(gridDim.x)) join(value, tmp);
-      }
-      __syncwarp(mask);
-      if (int(blockDim.x * blockDim.y) > 16) {
-        value_type tmp = Kokkos::shfl_down(value, 16, 32);
-        if (id + 16 < int(gridDim.x)) join(value, tmp);
-      }
-      __syncwarp(mask);
-    }
-  }
-  // The last block has in its thread=0 the global reduction value through
-  // "value"
-  return last_block;
-#else
-  (void)value;
-  (void)neutral;
-  (void)join;
-  (void)m_scratch_space;
-  (void)m_scratch_flags;
-  (void)max_active_thread;
-  return true;
-#endif
-}
-
-template <class ReducerType>
-__device__ inline
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    cuda_intra_warp_reduction(const ReducerType& reducer,
-                              typename ReducerType::value_type& result,
-                              const uint32_t max_active_thread = blockDim.y) {
-  using ValueType = typename ReducerType::value_type;
-
-  unsigned int shift = 1;
-
-  // Reduce over values from threads with different threadIdx.y
-  while (blockDim.x * shift < 32) {
-    const ValueType tmp = shfl_down(result, blockDim.x * shift, 32u);
-    // Only join if upper thread is active (this allows non power of two for
-    // blockDim.y
-    if (threadIdx.y + shift < max_active_thread) reducer.join(result, tmp);
-    shift *= 2;
-  }
-
-  result              = shfl(result, 0, 32);
-  reducer.reference() = result;
-}
-
-template <class ReducerType>
-__device__ inline
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    cuda_inter_warp_reduction(const ReducerType& reducer,
-                              typename ReducerType::value_type value,
-                              const int max_active_thread = blockDim.y) {
-  using ValueType = typename ReducerType::value_type;
-
-#define STEP_WIDTH 4
-  // Depending on the ValueType _shared__ memory must be aligned up to 8byte
-  // boundaries The reason not to use ValueType directly is that for types with
-  // constructors it could lead to race conditions
-  alignas(alignof(ValueType) > alignof(double) ? alignof(ValueType)
-                                               : alignof(double))
-      __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * STEP_WIDTH];
-  ValueType* result = (ValueType*)&sh_result;
-  const int step    = 32 / blockDim.x;
-  int shift         = STEP_WIDTH;
-  const int id      = threadIdx.y % step == 0 ? threadIdx.y / step : 65000;
-  if (id < STEP_WIDTH) {
-    result[id] = value;
-  }
-  __syncthreads();
-  while (shift <= max_active_thread / step) {
-    if (shift <= id && shift + STEP_WIDTH > id && threadIdx.x == 0) {
-      reducer.join(result[id % STEP_WIDTH], value);
-    }
-    __syncthreads();
-    shift += STEP_WIDTH;
-  }
-
-  value = result[0];
-  for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
-    reducer.join(value, result[i]);
-
-  reducer.reference() = value;
-}
-
-template <class ReducerType>
-__device__ inline
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    cuda_intra_block_reduction(const ReducerType& reducer,
-                               typename ReducerType::value_type value,
-                               const int max_active_thread = blockDim.y) {
-  cuda_intra_warp_reduction(reducer, value, max_active_thread);
-  cuda_inter_warp_reduction(reducer, value, max_active_thread);
-}
-
-template <class ReducerType>
-__device__ inline
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    cuda_intra_block_reduction(const ReducerType& reducer,
-                               const int max_active_thread = blockDim.y) {
-  cuda_intra_block_reduction(reducer, reducer.reference(), max_active_thread);
-}
-
-template <class ReducerType>
-__device__ inline
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value, bool>::type
-    cuda_inter_block_reduction(const ReducerType& reducer,
-                               Cuda::size_type* const m_scratch_space,
-                               Cuda::size_type* const m_scratch_flags,
-                               const int max_active_thread = blockDim.y) {
-#ifdef __CUDA_ARCH__
-  using pointer_type = typename ReducerType::value_type*;
-  using value_type   = typename ReducerType::value_type;
-
-  // Do the intra-block reduction with shfl operations and static shared memory
-  cuda_intra_block_reduction(reducer, max_active_thread);
-
-  value_type value = reducer.reference();
-
-  const int id = threadIdx.y * blockDim.x + threadIdx.x;
-
-  // One thread in the block writes block result to global scratch_memory
-  if (id == 0) {
-    pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x;
-    *global             = value;
-  }
-
-  // One warp of last block performs inter block reduction through loading the
-  // block values from global scratch_memory
-  bool last_block = false;
-
-  __threadfence();
-  __syncthreads();
-  if (id < 32) {
-    Cuda::size_type count;
-
-    // Figure out whether this is the last block
-    if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1);
-    count = Kokkos::shfl(count, 0, 32);
-
-    // Last block does the inter block reduction
-    if (count == gridDim.x - 1) {
-      // set flag back to zero
-      if (id == 0) *m_scratch_flags = 0;
-      last_block = true;
-      reducer.init(value);
-
-      pointer_type const volatile global = (pointer_type)m_scratch_space;
-
-      // Reduce all global values with splitting work over threads in one warp
-      const int step_size =
-          blockDim.x * blockDim.y < 32 ? blockDim.x * blockDim.y : 32;
-      for (int i = id; i < (int)gridDim.x; i += step_size) {
-        value_type tmp = global[i];
-        reducer.join(value, tmp);
+        reducer.join(&value, &tmp);
       }
 
       // Perform shfl reductions within the warp only join if contribution is
       // valid (allows gridDim.x non power of two and <32)
       if (int(blockDim.x * blockDim.y) > 1) {
         value_type tmp = Kokkos::shfl_down(value, 1, 32);
-        if (id + 1 < int(gridDim.x)) reducer.join(value, tmp);
+        if (id + 1 < int(gridDim.x)) reducer.join(&value, &tmp);
       }
       unsigned int mask = __activemask();
       __syncwarp(mask);
       if (int(blockDim.x * blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2, 32);
-        if (id + 2 < int(gridDim.x)) reducer.join(value, tmp);
+        if (id + 2 < int(gridDim.x)) reducer.join(&value, &tmp);
       }
       __syncwarp(mask);
       if (int(blockDim.x * blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4, 32);
-        if (id + 4 < int(gridDim.x)) reducer.join(value, tmp);
+        if (id + 4 < int(gridDim.x)) reducer.join(&value, &tmp);
       }
       __syncwarp(mask);
       if (int(blockDim.x * blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8, 32);
-        if (id + 8 < int(gridDim.x)) reducer.join(value, tmp);
+        if (id + 8 < int(gridDim.x)) reducer.join(&value, &tmp);
       }
       __syncwarp(mask);
       if (int(blockDim.x * blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16, 32);
-        if (id + 16 < int(gridDim.x)) reducer.join(value, tmp);
+        if (id + 16 < int(gridDim.x)) reducer.join(&value, &tmp);
       }
       __syncwarp(mask);
     }
   }
-
   // The last block has in its thread=0 the global reduction value through
   // "value"
   return last_block;
-#else
-  (void)reducer;
-  (void)m_scratch_space;
-  (void)m_scratch_flags;
-  (void)max_active_thread;
-  return true;
-#endif
 }
 
-template <class FunctorType, class ArgTag, bool DoScan, bool UseShfl>
+template <class FunctorType, bool DoScan, bool UseShfl>
 struct CudaReductionsFunctor;
 
-template <class FunctorType, class ArgTag>
-struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
-  using ValueTraits  = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin    = FunctorValueJoin<FunctorType, ArgTag>;
-  using ValueInit    = FunctorValueInit<FunctorType, ArgTag>;
-  using ValueOps     = FunctorValueOps<FunctorType, ArgTag>;
-  using pointer_type = typename ValueTraits::pointer_type;
-  using Scalar       = typename ValueTraits::value_type;
+template <class FunctorType>
+struct CudaReductionsFunctor<FunctorType, false, true> {
+  using pointer_type = typename FunctorType::pointer_type;
+  using Scalar       = typename FunctorType::value_type;
 
   __device__ static inline void scalar_intra_warp_reduction(
       const FunctorType& functor,
@@ -431,7 +234,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
                   << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
       Scalar tmp = Kokkos::shfl_down(value, delta, width, mask);
-      ValueJoin::join(functor, &value, &tmp);
+      functor.join(&value, &tmp);
     }
 
     Impl::in_place_shfl(result, value, 0, width, mask);
@@ -459,16 +262,16 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
     for (int w = shared_elements; w < num_warps; w += shared_elements) {
       if (warp_id >= w && warp_id < w + shared_elements) {
         if ((threadIdx.y * blockDim.x + threadIdx.x) % 32 == 0)
-          ValueJoin::join(functor, my_shared_team_buffer_element, &value);
+          functor.join(my_shared_team_buffer_element, &value);
       }
       __syncthreads();
     }
 
     if (warp_id == 0) {
-      ValueInit::init(functor, &value);
+      functor.init(&value);
       for (unsigned int i = threadIdx.y * blockDim.x + threadIdx.x;
            i < blockDim.y * blockDim.x / 32; i += 32)
-        ValueJoin::join(functor, &value, &shared_team_buffer_element[i]);
+        functor.join(&value, &shared_team_buffer_element[i]);
       scalar_intra_warp_reduction(functor, value, false, 32,
                                   *my_global_team_buffer_element);
     }
@@ -504,10 +307,10 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
     if (__syncthreads_or(num_teams_done == gridDim.x)) {
       is_last_block = true;
       *global_flags = 0;
-      ValueInit::init(functor, &value);
+      functor.init(&value);
       for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements;
            i += blockDim.x * blockDim.y) {
-        ValueJoin::join(functor, &value, &global_team_buffer_element[i]);
+        functor.join(&value, &global_team_buffer_element[i]);
       }
       scalar_intra_block_reduction(
           functor, value, false, shared_team_buffer_elements + (blockDim.y - 1),
@@ -517,14 +320,10 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
   }
 };
 
-template <class FunctorType, class ArgTag>
-struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
-  using ValueTraits  = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin    = FunctorValueJoin<FunctorType, ArgTag>;
-  using ValueInit    = FunctorValueInit<FunctorType, ArgTag>;
-  using ValueOps     = FunctorValueOps<FunctorType, ArgTag>;
-  using pointer_type = typename ValueTraits::pointer_type;
-  using Scalar       = typename ValueTraits::value_type;
+template <class FunctorType>
+struct CudaReductionsFunctor<FunctorType, false, false> {
+  using pointer_type = typename FunctorType::pointer_type;
+  using Scalar       = typename FunctorType::value_type;
 
   __device__ static inline void scalar_intra_warp_reduction(
       const FunctorType& functor,
@@ -539,13 +338,18 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
             : ((1 << width) - 1)
                   << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
     const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
+
+    __syncwarp(mask);
+
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
       if (lane_id + delta < 32) {
-        ValueJoin::join(functor, value, value + delta);
+        functor.join(value, value + delta);
       }
       __syncwarp(mask);
     }
-    *value = *(value - lane_id);
+    if (lane_id != 0) {
+      *value = *(value - lane_id);
+    }
   }
 
   __device__ static inline void scalar_intra_block_reduction(
@@ -605,10 +409,10 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
     if (__syncthreads_or(num_teams_done == gridDim.x)) {
       is_last_block = true;
       *global_flags = 0;
-      ValueInit::init(functor, &value);
+      functor.init(&value);
       for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements;
            i += blockDim.x * blockDim.y) {
-        ValueJoin::join(functor, &value, &global_team_buffer_element[i]);
+        functor.join(&value, &global_team_buffer_element[i]);
       }
       scalar_intra_block_reduction(
           functor, value, false, shared_team_buffer_elements + (blockDim.y - 1),
@@ -630,108 +434,127 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
 //----------------------------------------------------------------------------
 /*
  *  Algorithmic constraints:
- *   (a) blockDim.y is a power of two
- *   (b) blockDim.y <= 1024
- *   (c) blockDim.x == blockDim.z == 1
+ *   (a) blockDim.y <= 1024
+ *   (b) blockDim.x == blockDim.z == 1
  */
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType>
 __device__ void cuda_intra_block_reduce_scan(
     const FunctorType& functor,
-    const typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type
-        base_data) {
-  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
-
-  using pointer_type = typename ValueTraits::pointer_type;
-
-  const unsigned value_count   = ValueTraits::value_count(functor);
-  const unsigned BlockSizeMask = blockDim.y - 1;
-
-  // Must have power of two thread count
-
-  if (BlockSizeMask & blockDim.y) {
-    Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim");
-  }
-
-#define BLOCK_REDUCE_STEP(R, TD, S)                          \
-  if (!(R & ((1 << (S + 1)) - 1))) {                         \
-    ValueJoin::join(functor, TD, (TD - (value_count << S))); \
-  }
-
-#define BLOCK_SCAN_STEP(TD, N, S)                            \
-  if (N == (1 << S)) {                                       \
-    ValueJoin::join(functor, TD, (TD - (value_count << S))); \
-  }
-
-  const unsigned rtid_intra      = threadIdx.y ^ BlockSizeMask;
+    const typename FunctorType::pointer_type base_data) {
+  using pointer_type = typename FunctorType::pointer_type;
+
+  const unsigned value_count = functor.length();
+  const unsigned not_less_power_of_two =
+      (1 << (Impl::int_log2(blockDim.y - 1) + 1));
+  const unsigned BlockSizeMask = not_less_power_of_two - 1;
+  // There is at most one warp that is neither completely full or empty.
+  // For that warp, we shift all indices logically to the end and ignore join
+  // operations with unassigned indices in the warp when performing the intra
+  // warp reduction/scan.
+  const bool is_full_warp = (((threadIdx.y >> CudaTraits::WarpIndexShift) + 1)
+                             << CudaTraits::WarpIndexShift) <= blockDim.y;
+
+  const unsigned mapped_idx =
+      threadIdx.y + (is_full_warp ? 0
+                                  : (not_less_power_of_two - blockDim.y) &
+                                        (CudaTraits::WarpSize - 1));
   const pointer_type tdata_intra = base_data + value_count * threadIdx.y;
+  const pointer_type warp_start =
+      base_data + value_count * ((threadIdx.y >> CudaTraits::WarpIndexShift)
+                                 << CudaTraits::WarpIndexShift);
+
+  auto block_reduce_step = [&functor, value_count](
+                               int const R, pointer_type const TD, int const S,
+                               pointer_type memory_start, int index_shift) {
+    const auto join_ptr = TD - (value_count << S) + value_count * index_shift;
+    if (((R + 1) & ((1 << (S + 1)) - 1)) == 0 && join_ptr >= memory_start) {
+      functor.join(TD, join_ptr);
+    }
+  };
+
+  auto block_scan_step = [&functor, value_count](
+                             int const R, pointer_type const TD, int const S,
+                             pointer_type memory_start, int index_shift) {
+    const auto N        = (1 << (S + 1));
+    const auto join_ptr = TD - (value_count << S) + value_count * index_shift;
+    if (R >= N && ((R + 1) & (N - 1)) == (N >> 1) && join_ptr >= memory_start) {
+      functor.join(TD, join_ptr);
+    }
+  };
 
   {  // Intra-warp reduction:
     __syncwarp(0xffffffff);
-    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0)
+    block_reduce_step(mapped_idx, tdata_intra, 0, warp_start, 0);
     __syncwarp(0xffffffff);
-    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1)
+    block_reduce_step(mapped_idx, tdata_intra, 1, warp_start, 0);
     __syncwarp(0xffffffff);
-    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2)
+    block_reduce_step(mapped_idx, tdata_intra, 2, warp_start, 0);
     __syncwarp(0xffffffff);
-    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3)
+    block_reduce_step(mapped_idx, tdata_intra, 3, warp_start, 0);
     __syncwarp(0xffffffff);
-    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4)
+    block_reduce_step(mapped_idx, tdata_intra, 4, warp_start, 0);
     __syncwarp(0xffffffff);
   }
 
   __syncthreads();  // Wait for all warps to reduce
 
-  {  // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
-    const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask)
-                                << CudaTraits::WarpIndexShift;
-
-    unsigned inner_mask = __ballot_sync(0xffffffff, (rtid_inter < blockDim.y));
-    if (rtid_inter < blockDim.y) {
+  // Inter-warp reduce-scan by a single warp to avoid extra synchronizations.
+  {
+    // There is at most one warp where the memory address to be used is not
+    // (CudaTraits::WarpSize - 1) away from the warp start adress. For the
+    // following reduction, we shift all indices logically to the end of the
+    // next power-of-two to the number of warps.
+    const unsigned n_active_warps =
+        ((blockDim.y - 1) >> CudaTraits::WarpIndexShift) + 1;
+    const unsigned inner_mask =
+        __ballot_sync(0xffffffff, (threadIdx.y < n_active_warps));
+    if (threadIdx.y < n_active_warps) {
+      const bool is_full_warp_inter =
+          threadIdx.y < (blockDim.y >> CudaTraits::WarpIndexShift);
       const pointer_type tdata_inter =
-          base_data + value_count * (rtid_inter ^ BlockSizeMask);
+          base_data +
+          value_count * (is_full_warp_inter
+                             ? (threadIdx.y << CudaTraits::WarpIndexShift) +
+                                   (CudaTraits::WarpSize - 1)
+                             : blockDim.y - 1);
+      const unsigned index_shift =
+          is_full_warp_inter
+              ? 0
+              : blockDim.y - (threadIdx.y << CudaTraits::WarpIndexShift);
+      const int rtid_inter = (threadIdx.y << CudaTraits::WarpIndexShift) +
+                             (CudaTraits::WarpSize - 1) - index_shift;
 
       if ((1 << 5) < BlockSizeMask) {
         __syncwarp(inner_mask);
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
+        block_reduce_step(rtid_inter, tdata_inter, 5, base_data, index_shift);
       }
       if ((1 << 6) < BlockSizeMask) {
         __syncwarp(inner_mask);
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
+        block_reduce_step(rtid_inter, tdata_inter, 6, base_data, index_shift);
       }
       if ((1 << 7) < BlockSizeMask) {
         __syncwarp(inner_mask);
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
+        block_reduce_step(rtid_inter, tdata_inter, 7, base_data, index_shift);
       }
       if ((1 << 8) < BlockSizeMask) {
         __syncwarp(inner_mask);
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
+        block_reduce_step(rtid_inter, tdata_inter, 8, base_data, index_shift);
       }
       if ((1 << 9) < BlockSizeMask) {
         __syncwarp(inner_mask);
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
+        block_reduce_step(rtid_inter, tdata_inter, 9, base_data, index_shift);
       }
 
       if (DoScan) {
-        int n =
-            (rtid_inter & 32)
-                ? 32
-                : ((rtid_inter & 64)
-                       ? 64
-                       : ((rtid_inter & 128) ? 128
-                                             : ((rtid_inter & 256) ? 256 : 0)));
-
-        if (!(rtid_inter + n < blockDim.y)) n = 0;
-
         __syncwarp(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 8)
+        block_scan_step(rtid_inter, tdata_inter, 8, base_data, index_shift);
         __syncwarp(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 7)
+        block_scan_step(rtid_inter, tdata_inter, 7, base_data, index_shift);
         __syncwarp(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 6)
+        block_scan_step(rtid_inter, tdata_inter, 6, base_data, index_shift);
         __syncwarp(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 5)
+        block_scan_step(rtid_inter, tdata_inter, 5, base_data, index_shift);
       }
     }
   }
@@ -739,32 +562,27 @@ __device__ void cuda_intra_block_reduce_scan(
   __syncthreads();  // Wait for inter-warp reduce-scan to complete
 
   if (DoScan) {
-    int n =
-        (rtid_intra & 1)
-            ? 1
-            : ((rtid_intra & 2)
-                   ? 2
-                   : ((rtid_intra & 4)
-                          ? 4
-                          : ((rtid_intra & 8) ? 8
-                                              : ((rtid_intra & 16) ? 16 : 0))));
-
-    if (!(rtid_intra + n < blockDim.y)) n = 0;
+    block_scan_step(mapped_idx, tdata_intra, 4, warp_start, 0);
+    __threadfence_block();
     __syncwarp(0xffffffff);
-    BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block();
+    block_scan_step(mapped_idx, tdata_intra, 3, warp_start, 0);
+    __threadfence_block();
     __syncwarp(0xffffffff);
-    BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block();
+    block_scan_step(mapped_idx, tdata_intra, 2, warp_start, 0);
+    __threadfence_block();
     __syncwarp(0xffffffff);
-    BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block();
+    block_scan_step(mapped_idx, tdata_intra, 1, warp_start, 0);
+    __threadfence_block();
     __syncwarp(0xffffffff);
-    BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block();
+    block_scan_step(mapped_idx, tdata_intra, 0, warp_start, 0);
+    __threadfence_block();
     __syncwarp(0xffffffff);
-    BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block();
+    // Update with total from previous warps
+    if (mapped_idx >= CudaTraits::WarpSize &&
+        (mapped_idx & (CudaTraits::WarpSize - 1)) != (CudaTraits::WarpSize - 1))
+      functor.join(tdata_intra, warp_start - value_count);
     __syncwarp(0xffffffff);
   }
-
-#undef BLOCK_SCAN_STEP
-#undef BLOCK_REDUCE_STEP
 }
 
 //----------------------------------------------------------------------------
@@ -776,19 +594,14 @@ __device__ void cuda_intra_block_reduce_scan(
  *  Global reduce result is in the last threads' 'shared_data' location.
  */
 
-template <bool DoScan, class FunctorType, class ArgTag,
-          class SizeType = Cuda::size_type>
+template <bool DoScan, class FunctorType, class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan2(
     const FunctorType& functor, const Cuda::size_type block_id,
     const Cuda::size_type block_count, SizeType* const shared_data,
     SizeType* const global_data, Cuda::size_type* const global_flags) {
-  using size_type   = SizeType;
-  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
-  using ValueInit   = FunctorValueInit<FunctorType, ArgTag>;
-  using ValueOps    = FunctorValueOps<FunctorType, ArgTag>;
-
-  using pointer_type = typename ValueTraits::pointer_type;
+  using size_type    = SizeType;
+  using value_type   = typename FunctorType::value_type;
+  using pointer_type = typename FunctorType::pointer_type;
 
   // '__ffs' = position of the least significant bit set to 1.
   // 'blockDim.y' is guaranteed to be a power of two so this
@@ -803,14 +616,14 @@ __device__ bool cuda_single_inter_block_reduce_scan2(
         "blockDim");
   }
 
-  const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                 sizeof(size_type)>
-      word_count(ValueTraits::value_size(functor) / sizeof(size_type));
+  const integral_nonzero_constant<
+      size_type, std::is_pointer<typename FunctorType::reference_type>::value
+                     ? 0
+                     : sizeof(value_type) / sizeof(size_type)>
+      word_count((sizeof(value_type) * functor.length()) / sizeof(size_type));
 
   // Reduce the accumulation for the entire block.
-  cuda_intra_block_reduce_scan<false, FunctorType, ArgTag>(
-      functor, pointer_type(shared_data));
-
+  cuda_intra_block_reduce_scan<false>(functor, pointer_type(shared_data));
   {
     // Write accumulation total to global scratch space.
     // Accumulation total is the last thread's data.
@@ -840,31 +653,34 @@ __device__ bool cuda_single_inter_block_reduce_scan2(
 
     {
       void* const shared_ptr = shared_data + word_count.value * threadIdx.y;
-      /* reference_type shared_value = */ ValueInit::init(functor, shared_ptr);
+      /* reference_type shared_value = */ functor.init(
+          static_cast<pointer_type>(shared_ptr));
 
       for (size_type i = b; i < e; ++i) {
-        ValueJoin::join(functor, shared_ptr,
-                        global_data + word_count.value * i);
+        functor.join(
+            static_cast<pointer_type>(shared_ptr),
+            reinterpret_cast<pointer_type>(global_data + word_count.value * i));
       }
     }
 
-    cuda_intra_block_reduce_scan<DoScan, FunctorType, ArgTag>(
-        functor, pointer_type(shared_data));
+    cuda_intra_block_reduce_scan<DoScan>(functor, pointer_type(shared_data));
 
     if (DoScan) {
-      size_type* const shared_value =
+      pointer_type const shared_value = reinterpret_cast<pointer_type>(
           shared_data +
-          word_count.value * (threadIdx.y ? threadIdx.y - 1 : blockDim.y);
+          word_count.value * (threadIdx.y ? threadIdx.y - 1 : blockDim.y));
 
       if (!threadIdx.y) {
-        ValueInit::init(functor, shared_value);
+        functor.init(shared_value);
       }
 
       // Join previous inclusive scan value to each member
       for (size_type i = b; i < e; ++i) {
         size_type* const global_value = global_data + word_count.value * i;
-        ValueJoin::join(functor, shared_value, global_value);
-        ValueOps ::copy(functor, global_value, shared_value);
+        functor.join(shared_value,
+                     reinterpret_cast<pointer_type>(global_value));
+        functor.copy(reinterpret_cast<pointer_type>(global_value),
+                     reinterpret_cast<pointer_type>(shared_value));
       }
     }
   }
@@ -872,29 +688,42 @@ __device__ bool cuda_single_inter_block_reduce_scan2(
   return is_last_block;
 }
 
-template <bool DoScan, class FunctorType, class ArgTag,
-          class SizeType = Cuda::size_type>
+template <bool DoScan, class FunctorType, class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan(
     const FunctorType& functor, const Cuda::size_type block_id,
     const Cuda::size_type block_count, SizeType* const shared_data,
     SizeType* const global_data, Cuda::size_type* const global_flags) {
-  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
-  if (!DoScan && ValueTraits::StaticValueSize > 0)
+  if (!DoScan && !std::is_pointer<typename FunctorType::reference_type>::value)
     return Kokkos::Impl::CudaReductionsFunctor<
-        FunctorType, ArgTag, false, (ValueTraits::StaticValueSize > 16)>::
+        FunctorType, false, (sizeof(typename FunctorType::value_type) > 16)>::
         scalar_inter_block_reduction(functor, block_id, block_count,
                                      shared_data, global_data, global_flags);
   else
-    return cuda_single_inter_block_reduce_scan2<DoScan, FunctorType, ArgTag>(
+    return cuda_single_inter_block_reduce_scan2<DoScan>(
         functor, block_id, block_count, shared_data, global_data, global_flags);
 }
 
 // Size in bytes required for inter block reduce or scan
 template <bool DoScan, class FunctorType, class ArgTag>
-inline unsigned cuda_single_inter_block_reduce_scan_shmem(
-    const FunctorType& functor, const unsigned BlockSize) {
-  return (BlockSize + 2) *
-         Impl::FunctorValueTraits<FunctorType, ArgTag>::value_size(functor);
+inline std::enable_if_t<DoScan, unsigned>
+cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
+                                          const unsigned BlockSize) {
+  using Analysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                            RangePolicy<Cuda, ArgTag>, FunctorType>;
+
+  return (BlockSize + 2) * Analysis::value_size(functor);
+}
+
+template <bool DoScan, class FunctorType, class ArgTag>
+inline std::enable_if_t<!DoScan, unsigned>
+cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
+                                          const unsigned BlockSize) {
+  using Analysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                            RangePolicy<Cuda, ArgTag>, FunctorType>;
+
+  return (BlockSize + 2) * Analysis::value_size(functor);
 }
 
 template <typename WorkTag, typename Policy, typename FunctorType>
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
index 777f57ced..8f05448b1 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_TASKDAG)
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 88ac0d187..1f2e394f1 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -338,9 +338,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
 template <class Scheduler>
 class TaskQueueSpecializationConstrained<
-    Scheduler,
-    typename std::enable_if<std::is_same<typename Scheduler::execution_space,
-                                         Kokkos::Cuda>::value>::type> {
+    Scheduler, std::enable_if_t<std::is_same<
+                   typename Scheduler::execution_space, Kokkos::Cuda>::value>> {
  public:
   using scheduler_type  = Scheduler;
   using execution_space = Kokkos::Cuda;
@@ -780,12 +779,12 @@ namespace Kokkos {
 // template<typename iType1, typename iType2>
 // KOKKOS_INLINE_FUNCTION
 // Impl::TeamThreadRangeBoundariesStruct
-//  < typename std::common_type<iType1,iType2>::type
+//  < std::common_type_t<iType1,iType2>
 //  , Impl::TaskExec< Kokkos::Cuda > >
 // TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
 //               , const iType1 & begin, const iType2 & end )
 //{
-//  using iType = typename std::common_type< iType1, iType2 >::type;
+//  using iType = std::common_type_t< iType1, iType2 >;
 //  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec<
 //  Kokkos::Cuda > >(
 //           thread, iType(begin), iType(end) );
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index 922b980a2..ffafc47f0 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -74,8 +74,7 @@ struct CudaJoinFunctor {
   using value_type = Type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   volatile const value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update += input;
   }
 };
@@ -131,28 +130,20 @@ class CudaTeamMember {
   KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
   KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
   KOKKOS_INLINE_FUNCTION int team_rank() const {
-#ifdef __CUDA_ARCH__
-    return threadIdx.y;
-#else
-    return 0;
-#endif
+    KOKKOS_IF_ON_DEVICE((return threadIdx.y;))
+    KOKKOS_IF_ON_HOST((return 0;))
   }
 
   KOKKOS_INLINE_FUNCTION int team_size() const {
-#ifdef __CUDA_ARCH__
-    return blockDim.y;
-#else
-    return 1;
-#endif
+    KOKKOS_IF_ON_DEVICE((return blockDim.y;))
+    KOKKOS_IF_ON_HOST((return 1;))
   }
 
   KOKKOS_INLINE_FUNCTION void team_barrier() const {
-#ifdef __CUDA_ARCH__
-    if (1 == blockDim.z)
-      __syncthreads();  // team == block
-    else
-      __threadfence_block();  // team <= warp
-#endif
+    KOKKOS_IF_ON_DEVICE((
+        if (1 == blockDim.z) { __syncthreads(); }  // team == block
+        else { __threadfence_block(); }            // team <= warp
+        ))
   }
 
   //--------------------------------------------------------------------------
@@ -162,21 +153,21 @@ class CudaTeamMember {
                                              const int& thread_id) const {
     (void)val;
     (void)thread_id;
-#ifdef __CUDA_ARCH__
-    if (1 == blockDim.z) {  // team == block
-      __syncthreads();
-      // Wait for shared data write until all threads arrive here
-      if (threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id) {
-        *((ValueType*)m_team_reduce) = val;
-      }
-      __syncthreads();  // Wait for shared data read until root thread writes
-      val = *((ValueType*)m_team_reduce);
-    } else {               // team <= warp
-      ValueType tmp(val);  // input might not be a register variable
-      Impl::in_place_shfl(val, tmp, blockDim.x * thread_id,
-                          blockDim.x * blockDim.y);
-    }
-#endif
+    KOKKOS_IF_ON_DEVICE((
+        if (1 == blockDim.z) {  // team == block
+          __syncthreads();
+          // Wait for shared data write until all threads arrive here
+          if (threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id) {
+            *((ValueType*)m_team_reduce) = val;
+          }
+          __syncthreads();  // Wait for shared data read until root thread
+                            // writes
+          val = *((ValueType*)m_team_reduce);
+        } else {               // team <= warp
+          ValueType tmp(val);  // input might not be a register variable
+          Impl::in_place_shfl(val, tmp, blockDim.x * thread_id,
+                              blockDim.x * blockDim.y);
+        }))
   }
 
   template <class Closure, class ValueType>
@@ -185,23 +176,23 @@ class CudaTeamMember {
     (void)f;
     (void)val;
     (void)thread_id;
-#ifdef __CUDA_ARCH__
-    f(val);
-
-    if (1 == blockDim.z) {  // team == block
-      __syncthreads();
-      // Wait for shared data write until all threads arrive here
-      if (threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id) {
-        *((ValueType*)m_team_reduce) = val;
-      }
-      __syncthreads();  // Wait for shared data read until root thread writes
-      val = *((ValueType*)m_team_reduce);
-    } else {               // team <= warp
-      ValueType tmp(val);  // input might not be a register variable
-      Impl::in_place_shfl(val, tmp, blockDim.x * thread_id,
-                          blockDim.x * blockDim.y);
-    }
-#endif
+    KOKKOS_IF_ON_DEVICE((
+        f(val);
+
+        if (1 == blockDim.z) {  // team == block
+          __syncthreads();
+          // Wait for shared data write until all threads arrive here
+          if (threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id) {
+            *((ValueType*)m_team_reduce) = val;
+          }
+          __syncthreads();  // Wait for shared data read until root thread
+                            // writes
+          val = *((ValueType*)m_team_reduce);
+        } else {               // team <= warp
+          ValueType tmp(val);  // input might not be a register variable
+          Impl::in_place_shfl(val, tmp, blockDim.x * thread_id,
+                              blockDim.x * blockDim.y);
+        }))
   }
 
   //--------------------------------------------------------------------------
@@ -220,22 +211,23 @@ class CudaTeamMember {
    *    ( 1 == blockDim.z )
    */
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer) const noexcept {
     team_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer,
-                  typename ReducerType::value_type& value) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer,
+              typename ReducerType::value_type& value) const noexcept {
     (void)reducer;
     (void)value;
-#ifdef __CUDA_ARCH__
-    cuda_intra_block_reduction(reducer, value, blockDim.y);
-#endif /* #ifdef __CUDA_ARCH__ */
+    KOKKOS_IF_ON_DEVICE(
+        (typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                        TeamPolicy<Cuda>, ReducerType>::Reducer
+             wrapped_reducer(&reducer);
+         cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);
+         reducer.reference() = value;))
   }
 
   //--------------------------------------------------------------------------
@@ -251,36 +243,33 @@ class CudaTeamMember {
   template <typename Type>
   KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value,
                                         Type* const global_accum) const {
-#ifdef __CUDA_ARCH__
-    Type* const base_data = (Type*)m_team_reduce;
+    KOKKOS_IF_ON_DEVICE((
+        Type* const base_data = (Type*)m_team_reduce;
 
-    __syncthreads();  // Don't write in to shared data until all threads have
-                      // entered this function
+        __syncthreads();  // Don't write in to shared data until all threads
+                          // have entered this function
 
-    if (0 == threadIdx.y) {
-      base_data[0] = 0;
-    }
+        if (0 == threadIdx.y) { base_data[0] = 0; }
 
-    base_data[threadIdx.y + 1] = value;
+        base_data[threadIdx.y + 1] = value;
+        Impl::CudaJoinFunctor<Type> cuda_join_functor;
+        typename Impl::FunctorAnalysis<
+            Impl::FunctorPatternInterface::SCAN, TeamPolicy<Cuda>,
+            Impl::CudaJoinFunctor<Type>>::Reducer reducer(&cuda_join_functor);
+        Impl::cuda_intra_block_reduce_scan<true>(reducer, base_data + 1);
 
-    Impl::cuda_intra_block_reduce_scan<true, Impl::CudaJoinFunctor<Type>, void>(
-        Impl::CudaJoinFunctor<Type>(), base_data + 1);
+        if (global_accum) {
+          if (blockDim.y == threadIdx.y + 1) {
+            base_data[blockDim.y] =
+                atomic_fetch_add(global_accum, base_data[blockDim.y]);
+          }
+          __syncthreads();  // Wait for atomic
+          base_data[threadIdx.y] += base_data[blockDim.y];
+        }
 
-    if (global_accum) {
-      if (blockDim.y == threadIdx.y + 1) {
-        base_data[blockDim.y] =
-            atomic_fetch_add(global_accum, base_data[blockDim.y]);
-      }
-      __syncthreads();  // Wait for atomic
-      base_data[threadIdx.y] += base_data[blockDim.y];
-    }
+        return base_data[threadIdx.y];))
 
-    return base_data[threadIdx.y];
-#else
-    (void)value;
-    (void)global_accum;
-    return Type();
-#endif
+    KOKKOS_IF_ON_HOST(((void)value; (void)global_accum; return Type();))
   }
 
   /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
@@ -296,57 +285,54 @@ class CudaTeamMember {
   //----------------------------------------
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer) {
+  KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value>
+  vector_reduce(ReducerType const& reducer) {
     vector_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer,
-                    typename ReducerType::value_type& value) {
+  KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value>
+  vector_reduce(ReducerType const& reducer,
+                typename ReducerType::value_type& value) {
     (void)reducer;
     (void)value;
-#ifdef __CUDA_ARCH__
-    if (blockDim.x == 1) return;
-
-    // Intra vector lane shuffle reduction:
-    typename ReducerType::value_type tmp(value);
-    typename ReducerType::value_type tmp2 = tmp;
-
-    unsigned mask =
-        blockDim.x == 32
-            ? 0xffffffff
-            : ((1 << blockDim.x) - 1)
-                  << ((threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-
-    for (int i = blockDim.x; (i >>= 1);) {
-      Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask);
-      if ((int)threadIdx.x < i) {
-        reducer.join(tmp, tmp2);
-      }
-    }
+    KOKKOS_IF_ON_DEVICE(
+        (if (blockDim.x == 1) return;
 
-    // Broadcast from root lane to all other lanes.
-    // Cannot use "butterfly" algorithm to avoid the broadcast
-    // because floating point summation is not associative
-    // and thus different threads could have different results.
+         // Intra vector lane shuffle reduction:
+         typename ReducerType::value_type tmp(value);
+         typename ReducerType::value_type tmp2 = tmp;
 
-    Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask);
-    value               = tmp2;
-    reducer.reference() = tmp2;
-#endif
+         unsigned mask =
+             blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << ((threadIdx.y % (32 / blockDim.x)) * blockDim.x);
+
+         for (int i = blockDim.x; (i >>= 1);) {
+           Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask);
+           if ((int)threadIdx.x < i) {
+             reducer.join(tmp, tmp2);
+           }
+         }
+
+         // Broadcast from root lane to all other lanes.
+         // Cannot use "butterfly" algorithm to avoid the broadcast
+         // because floating point summation is not associative
+         // and thus different threads could have different results.
+
+         Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask);
+         value = tmp2; reducer.reference() = tmp2;))
   }
 
   //----------------------------------------
   // Private for the driver
 
   KOKKOS_INLINE_FUNCTION
-  CudaTeamMember(void* shared, const int shared_begin, const int shared_size,
-                 void* scratch_level_1_ptr, const int scratch_level_1_size,
-                 const int arg_league_rank, const int arg_league_size)
+  CudaTeamMember(void* shared, const size_t shared_begin,
+                 const size_t shared_size, void* scratch_level_1_ptr,
+                 const size_t scratch_level_1_size, const int arg_league_rank,
+                 const int arg_league_size)
       : m_team_reduce(shared),
         m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
                       scratch_level_1_ptr, scratch_level_1_size),
@@ -443,9 +429,9 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::CudaTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::CudaTeamMember>
 TeamThreadRange(const Impl::CudaTeamMember& thread, iType1 begin, iType2 end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::CudaTeamMember>(
       thread, iType(begin), iType(end));
 }
@@ -460,10 +446,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::CudaTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::CudaTeamMember>
 TeamVectorRange(const Impl::CudaTeamMember& thread, const iType1& begin,
                 const iType2& end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>(
       thread, iType(begin), iType(end));
 }
@@ -478,10 +464,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::CudaTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::CudaTeamMember>
 ThreadVectorRange(const Impl::CudaTeamMember& thread, iType1 arg_begin,
                   iType2 arg_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>(
       thread, iType(arg_begin), iType(arg_end));
 }
@@ -513,11 +499,9 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Closure& closure) {
   (void)loop_boundaries;
   (void)closure;
-#ifdef __CUDA_ARCH__
-  for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end;
-       i += blockDim.y)
-    closure(i);
-#endif
+  KOKKOS_IF_ON_DEVICE(
+      (for (iType i = loop_boundaries.start + threadIdx.y;
+            i < loop_boundaries.end; i += blockDim.y) { closure(i); }))
 }
 
 //----------------------------------------------------------------------------
@@ -531,26 +515,22 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  *  performed and put into result.
  */
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::CudaTeamMember>& loop_boundaries,
-                    const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::CudaTeamMember>& loop_boundaries,
+                const Closure& closure, const ReducerType& reducer) {
   (void)loop_boundaries;
   (void)closure;
   (void)reducer;
-#ifdef __CUDA_ARCH__
-  typename ReducerType::value_type value;
-  reducer.init(value);
+  KOKKOS_IF_ON_DEVICE(
+      (typename ReducerType::value_type value;
 
-  for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end;
-       i += blockDim.y) {
-    closure(i, value);
-  }
+       reducer.init(value);
 
-  loop_boundaries.member.team_reduce(reducer, value);
+       for (iType i = loop_boundaries.start + threadIdx.y;
+            i < loop_boundaries.end; i += blockDim.y) { closure(i, value); }
 
-#endif
+       loop_boundaries.member.team_reduce(reducer, value);))
 }
 
 /** \brief  Inter-thread parallel_reduce assuming summation.
@@ -562,28 +542,23 @@ KOKKOS_INLINE_FUNCTION
  *  performed and put into result.
  */
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::CudaTeamMember>& loop_boundaries,
-                    const Closure& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::CudaTeamMember>& loop_boundaries,
+                const Closure& closure, ValueType& result) {
   (void)loop_boundaries;
   (void)closure;
   (void)result;
-#ifdef __CUDA_ARCH__
-  ValueType val;
-  Kokkos::Sum<ValueType> reducer(val);
+  KOKKOS_IF_ON_DEVICE(
+      (ValueType val; Kokkos::Sum<ValueType> reducer(val);
 
-  reducer.init(reducer.reference());
+       reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end;
-       i += blockDim.y) {
-    closure(i, val);
-  }
+       for (iType i = loop_boundaries.start + threadIdx.y;
+            i < loop_boundaries.end; i += blockDim.y) { closure(i, val); }
 
-  loop_boundaries.member.team_reduce(reducer, val);
-  result = reducer.reference();
-#endif
+       loop_boundaries.member.team_reduce(reducer, val);
+       result = reducer.reference();))
 }
 
 template <typename iType, class Closure>
@@ -593,60 +568,52 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Closure& closure) {
   (void)loop_boundaries;
   (void)closure;
-#ifdef __CUDA_ARCH__
-  for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
-       i < loop_boundaries.end; i += blockDim.y * blockDim.x)
-    closure(i);
-#endif
+  KOKKOS_IF_ON_DEVICE((for (iType i = loop_boundaries.start +
+                                      threadIdx.y * blockDim.x + threadIdx.x;
+                            i < loop_boundaries.end;
+                            i += blockDim.y * blockDim.x) { closure(i); }))
 }
 
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                        iType, Impl::CudaTeamMember>& loop_boundaries,
-                    const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::CudaTeamMember>& loop_boundaries,
+                const Closure& closure, const ReducerType& reducer) {
   (void)loop_boundaries;
   (void)closure;
   (void)reducer;
-#ifdef __CUDA_ARCH__
-  typename ReducerType::value_type value;
-  reducer.init(value);
+  KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value;
+                       reducer.init(value);
 
-  for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
-       i < loop_boundaries.end; i += blockDim.y * blockDim.x) {
-    closure(i, value);
-  }
+                       for (iType i = loop_boundaries.start +
+                                      threadIdx.y * blockDim.x + threadIdx.x;
+                            i < loop_boundaries.end;
+                            i += blockDim.y * blockDim.x) { closure(i, value); }
 
-  loop_boundaries.member.vector_reduce(reducer, value);
-  loop_boundaries.member.team_reduce(reducer, value);
-#endif
+                       loop_boundaries.member.vector_reduce(reducer, value);
+                       loop_boundaries.member.team_reduce(reducer, value);))
 }
 
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                        iType, Impl::CudaTeamMember>& loop_boundaries,
-                    const Closure& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::CudaTeamMember>& loop_boundaries,
+                const Closure& closure, ValueType& result) {
   (void)loop_boundaries;
   (void)closure;
   (void)result;
-#ifdef __CUDA_ARCH__
-  ValueType val;
-  Kokkos::Sum<ValueType> reducer(val);
+  KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum<ValueType> reducer(val);
 
-  reducer.init(reducer.reference());
+                       reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
-       i < loop_boundaries.end; i += blockDim.y * blockDim.x) {
-    closure(i, val);
-  }
+                       for (iType i = loop_boundaries.start +
+                                      threadIdx.y * blockDim.x + threadIdx.x;
+                            i < loop_boundaries.end;
+                            i += blockDim.y * blockDim.x) { closure(i, val); }
 
-  loop_boundaries.member.vector_reduce(reducer);
-  loop_boundaries.member.team_reduce(reducer);
-  result = reducer.reference();
-#endif
+                       loop_boundaries.member.vector_reduce(reducer);
+                       loop_boundaries.member.team_reduce(reducer);
+                       result = reducer.reference();))
 }
 
 //----------------------------------------------------------------------------
@@ -664,16 +631,14 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Closure& closure) {
   (void)loop_boundaries;
   (void)closure;
-#ifdef __CUDA_ARCH__
-  for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end;
-       i += blockDim.x) {
-    closure(i);
-  }
-  __syncwarp(blockDim.x == 32
-                 ? 0xffffffff
-                 : ((1 << blockDim.x) - 1)
-                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#endif
+  KOKKOS_IF_ON_DEVICE((
+      for (iType i = loop_boundaries.start + threadIdx.x;
+           i < loop_boundaries.end; i += blockDim.x) { closure(i); }
+
+      __syncwarp(blockDim.x == 32
+                     ? 0xffffffff
+                     : ((1 << blockDim.x) - 1)
+                           << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);))
 }
 
 //----------------------------------------------------------------------------
@@ -690,26 +655,24 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  *  constructed value.
  */
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<is_reducer<ReducerType>::value>::type
-    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::CudaTeamMember> const& loop_boundaries,
-                    Closure const& closure, ReducerType const& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::CudaTeamMember> const& loop_boundaries,
+                Closure const& closure, ReducerType const& reducer) {
   (void)loop_boundaries;
   (void)closure;
   (void)reducer;
-#ifdef __CUDA_ARCH__
+  KOKKOS_IF_ON_DEVICE((
 
-  reducer.init(reducer.reference());
+      reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end;
-       i += blockDim.x) {
-    closure(i, reducer.reference());
-  }
+      for (iType i = loop_boundaries.start + threadIdx.x;
+           i < loop_boundaries.end;
+           i += blockDim.x) { closure(i, reducer.reference()); }
 
-  Impl::CudaTeamMember::vector_reduce(reducer);
+      Impl::CudaTeamMember::vector_reduce(reducer);
 
-#endif
+      ))
 }
 
 /** \brief  Intra-thread vector parallel_reduce.
@@ -724,25 +687,22 @@ KOKKOS_INLINE_FUNCTION
  *  constructed value.
  */
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!is_reducer<ValueType>::value>::type
-    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::CudaTeamMember> const& loop_boundaries,
-                    Closure const& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!is_reducer<ValueType>::value>
+parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::CudaTeamMember> const& loop_boundaries,
+                Closure const& closure, ValueType& result) {
   (void)loop_boundaries;
   (void)closure;
   (void)result;
-#ifdef __CUDA_ARCH__
-  result = ValueType();
+  KOKKOS_IF_ON_DEVICE(
+      (result = ValueType();
 
-  for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end;
-       i += blockDim.x) {
-    closure(i, result);
-  }
+       for (iType i = loop_boundaries.start + threadIdx.x;
+            i < loop_boundaries.end; i += blockDim.x) { closure(i, result); }
 
-  Impl::CudaTeamMember::vector_reduce(Kokkos::Sum<ValueType>(result));
+       Impl::CudaTeamMember::vector_reduce(Kokkos::Sum<ValueType>(result));
 
-#endif
+       ))
 }
 
 //----------------------------------------------------------------------------
@@ -804,79 +764,84 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
  *  The last call to closure has final == true.
  */
 template <typename iType, class Closure, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
-                      iType, Impl::CudaTeamMember>& loop_boundaries,
-                  const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                  iType, Impl::CudaTeamMember>& loop_boundaries,
+              const Closure& closure, const ReducerType& reducer) {
   (void)loop_boundaries;
   (void)closure;
   (void)reducer;
-#ifdef __CUDA_ARCH__
-
-  using value_type = typename ReducerType::value_type;
-  value_type accum;
-  reducer.init(accum);
-  const value_type identity = accum;
-
-  // Loop through boundaries by vector-length chunks
-  // must scan at each iteration
-
-  // All thread "lanes" must loop the same number of times.
-  // Determine an loop end for all thread "lanes."
-  // Requires:
-  //   blockDim.x is power of two and thus
-  //     ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
-  //   1 <= blockDim.x <= CudaTraits::WarpSize
-
-  const int mask = blockDim.x - 1;
-  const unsigned active_mask =
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x;
-  const int rem = loop_boundaries.end & mask;  // == end % blockDim.x
-  const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0);
-
-  for (int i = threadIdx.x; i < end; i += blockDim.x) {
-    value_type val = identity;
-
-    // First acquire per-lane contributions.
-    // This sets i's val to i-1's contribution
-    // to make the latter in_place_shfl_up an
-    // exclusive scan -- the final accumulation
-    // of i's val will be included in the second
-    // closure call later.
-    if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false);
-
-    // Bottom up exclusive scan in triangular pattern
-    // where each CUDA thread is the root of a reduction tree
-    // from the zeroth "lane" to itself.
-    //  [t] += [t-1] if t >= 1
-    //  [t] += [t-2] if t >= 2
-    //  [t] += [t-4] if t >= 4
-    //  ...
-    //  This differs from the non-reducer overload, where an inclusive scan was
-    //  implemented, because in general the binary operator cannot be inverted
-    //  and we would not be able to remove the inclusive contribution by
-    //  inversion.
-    for (int j = 1; j < (int)blockDim.x; j <<= 1) {
-      value_type tmp = identity;
-      Impl::in_place_shfl_up(tmp, val, j, blockDim.x, active_mask);
-      if (j <= (int)threadIdx.x) {
-        reducer.join(val, tmp);
+  KOKKOS_IF_ON_DEVICE((
+
+      using value_type = typename ReducerType::value_type;
+
+      value_type accum;
+
+      reducer.init(accum);
+
+      const value_type identity = accum;
+
+      // Loop through boundaries by vector-length chunks
+      // must scan at each iteration
+
+      // All thread "lanes" must loop the same number of times.
+      // Determine an loop end for all thread "lanes."
+      // Requires:
+      //   blockDim.x is power of two and thus
+      //     ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
+      //   1 <= blockDim.x <= CudaTraits::WarpSize
+
+      const int mask = blockDim.x - 1;
+      const unsigned active_mask =
+          blockDim.x == 32
+              ? 0xffffffff
+              : ((1 << blockDim.x) - 1)
+                    << (threadIdx.y % (32 / blockDim.x)) * blockDim.x;
+      const int rem = loop_boundaries.end & mask;  // == end % blockDim.x
+      const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0);
+
+      for (int i = threadIdx.x; i < end; i += blockDim.x) {
+        value_type val = identity;
+
+        // First acquire per-lane contributions.
+        // This sets i's val to i-1's contribution
+        // to make the latter in_place_shfl_up an
+        // exclusive scan -- the final accumulation
+        // of i's val will be included in the second
+        // closure call later.
+        if (i < loop_boundaries.end && threadIdx.x > 0) {
+          closure(i - 1, val, false);
+        }
+
+        // Bottom up exclusive scan in triangular pattern
+        // where each CUDA thread is the root of a reduction tree
+        // from the zeroth "lane" to itself.
+        //  [t] += [t-1] if t >= 1
+        //  [t] += [t-2] if t >= 2
+        //  [t] += [t-4] if t >= 4
+        //  ...
+        //  This differs from the non-reducer overload, where an inclusive scan
+        //  was implemented, because in general the binary operator cannot be
+        //  inverted and we would not be able to remove the inclusive
+        //  contribution by inversion.
+        for (int j = 1; j < (int)blockDim.x; j <<= 1) {
+          value_type tmp = identity;
+          Impl::in_place_shfl_up(tmp, val, j, blockDim.x, active_mask);
+          if (j <= (int)threadIdx.x) {
+            reducer.join(val, tmp);
+          }
+        }
+
+        // Include accumulation
+        reducer.join(val, accum);
+
+        // Update i's contribution into the val
+        // and add it to accum for next round
+        if (i < loop_boundaries.end) closure(i, val, true);
+        Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask);
       }
-    }
-
-    // Include accumulation
-    reducer.join(val, accum);
 
-    // Update i's contribution into the val
-    // and add it to accum for next round
-    if (i < loop_boundaries.end) closure(i, val, true);
-    Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask);
-  }
-
-#endif
+      ))
 }
 
 //----------------------------------------------------------------------------
@@ -909,13 +874,13 @@ KOKKOS_INLINE_FUNCTION void single(
     const Impl::VectorSingleStruct<Impl::CudaTeamMember>&,
     const FunctorType& lambda) {
   (void)lambda;
-#ifdef __CUDA_ARCH__
-  if (threadIdx.x == 0) lambda();
-  __syncwarp(blockDim.x == 32
-                 ? 0xffffffff
-                 : ((1 << blockDim.x) - 1)
-                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#endif
+  KOKKOS_IF_ON_DEVICE((
+      if (threadIdx.x == 0) { lambda(); }
+
+      __syncwarp(blockDim.x == 32
+                     ? 0xffffffff
+                     : ((1 << blockDim.x) - 1)
+                           << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);))
 }
 
 template <class FunctorType>
@@ -923,13 +888,13 @@ KOKKOS_INLINE_FUNCTION void single(
     const Impl::ThreadSingleStruct<Impl::CudaTeamMember>&,
     const FunctorType& lambda) {
   (void)lambda;
-#ifdef __CUDA_ARCH__
-  if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
-  __syncwarp(blockDim.x == 32
-                 ? 0xffffffff
-                 : ((1 << blockDim.x) - 1)
-                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#endif
+  KOKKOS_IF_ON_DEVICE((
+      if (threadIdx.x == 0 && threadIdx.y == 0) { lambda(); }
+
+      __syncwarp(blockDim.x == 32
+                     ? 0xffffffff
+                     : ((1 << blockDim.x) - 1)
+                           << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);))
 }
 
 template <class FunctorType, class ValueType>
@@ -938,14 +903,16 @@ KOKKOS_INLINE_FUNCTION void single(
     const FunctorType& lambda, ValueType& val) {
   (void)lambda;
   (void)val;
-#ifdef __CUDA_ARCH__
-  if (threadIdx.x == 0) lambda(val);
-  unsigned mask = blockDim.x == 32
-                      ? 0xffffffff
-                      : ((1 << blockDim.x) - 1)
-                            << ((threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-  Impl::in_place_shfl(val, val, 0, blockDim.x, mask);
-#endif
+  KOKKOS_IF_ON_DEVICE(
+      (if (threadIdx.x == 0) { lambda(val); }
+
+       unsigned mask =
+           blockDim.x == 32
+               ? 0xffffffff
+               : ((1 << blockDim.x) - 1)
+                     << ((threadIdx.y % (32 / blockDim.x)) * blockDim.x);
+
+       Impl::in_place_shfl(val, val, 0, blockDim.x, mask);))
 }
 
 template <class FunctorType, class ValueType>
@@ -955,12 +922,10 @@ KOKKOS_INLINE_FUNCTION void single(
   (void)single_struct;
   (void)lambda;
   (void)val;
-#ifdef __CUDA_ARCH__
-  if (threadIdx.x == 0 && threadIdx.y == 0) {
-    lambda(val);
-  }
-  single_struct.team_member.team_broadcast(val, 0);
-#endif
+  KOKKOS_IF_ON_DEVICE(
+      (if (threadIdx.x == 0 && threadIdx.y == 0) { lambda(val); }
+
+       single_struct.team_member.team_broadcast(val, 0);))
 }
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
index 31d3c47e1..d3d881424 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -79,9 +79,9 @@ struct in_place_shfl_op {
   // sizeof(Scalar) <= sizeof(int) case
   template <class Scalar>
   // requires _assignable_from_bits<Scalar>
-  __device__ inline typename std::enable_if<sizeof(Scalar) <= sizeof(int)>::type
-  operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width,
-             unsigned mask = shfl_all_mask) const noexcept {
+  __device__ inline std::enable_if_t<sizeof(Scalar) <= sizeof(int)> operator()(
+      Scalar& out, Scalar const& in, int lane_or_delta, int width,
+      unsigned mask = shfl_all_mask) const noexcept {
     using shfl_type = int;
     union conv_type {
       Scalar orig;
@@ -106,10 +106,9 @@ struct in_place_shfl_op {
   // sizeof(Scalar) == sizeof(double) case
   // requires _assignable_from_bits<Scalar>
   template <class Scalar>
-  __device__ inline
-      typename std::enable_if<sizeof(Scalar) == sizeof(double)>::type
-      operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width,
-                 unsigned mask = shfl_all_mask) const noexcept {
+  __device__ inline std::enable_if_t<sizeof(Scalar) == sizeof(double)>
+  operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width,
+             unsigned mask = shfl_all_mask) const noexcept {
     //------------------------------------------------
     reinterpret_cast<double&>(out) = self().do_shfl_op(
         mask, *reinterpret_cast<double const*>(&in), lane_or_delta, width);
@@ -119,10 +118,9 @@ struct in_place_shfl_op {
   // sizeof(Scalar) == sizeof(double) case
   // requires _assignable_from_bits<Scalar>
   template <typename Scalar>
-  __device__ inline
-      typename std::enable_if<sizeof(Scalar) == sizeof(double)>::type
-      operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width,
-                 unsigned mask = shfl_all_mask) const noexcept {
+  __device__ inline std::enable_if_t<sizeof(Scalar) == sizeof(double)>
+  operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width,
+             unsigned mask = shfl_all_mask) const noexcept {
     //------------------------------------------------
     int lo   = __double2loint(*reinterpret_cast<const double*>(&val));
     int hi   = __double2hiint(*reinterpret_cast<const double*>(&val));
@@ -136,10 +134,9 @@ struct in_place_shfl_op {
 
   // sizeof(Scalar) > sizeof(double) case
   template <typename Scalar>
-  __device__ inline
-      typename std::enable_if<(sizeof(Scalar) > sizeof(double))>::type
-      operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width,
-                 unsigned mask = shfl_all_mask) const noexcept {
+  __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))>
+  operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width,
+             unsigned mask = shfl_all_mask) const noexcept {
     // TODO DSH shouldn't this be KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF instead of
     //      sizeof(int)? (Need benchmarks to decide which is faster)
     using shuffle_as_t = int;
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
index dec6ef15e..a17582082 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -69,12 +69,10 @@ struct CudaTextureFetch {
   // Deference operator pulls through texture object and returns by value
   template <typename iType>
   KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const {
-#if defined(__CUDA_ARCH__) && (300 <= __CUDA_ARCH__)
-    AliasType v = tex1Dfetch<AliasType>(m_obj, i + m_offset);
-    return *(reinterpret_cast<ValueType*>(&v));
-#else
-    return m_ptr[i];
-#endif
+    KOKKOS_IF_ON_DEVICE(
+        (AliasType v = tex1Dfetch<AliasType>(m_obj, i + m_offset);
+         return *(reinterpret_cast<ValueType*>(&v));))
+    KOKKOS_IF_ON_HOST((return m_ptr[i];))
   }
 
   // Pointer to referenced memory
@@ -139,11 +137,13 @@ struct CudaLDGFetch {
 
   template <typename iType>
   KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const {
-#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__)
-    AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
-    return *(reinterpret_cast<ValueType*>(&v));
-#else
+#if defined(KOKKOS_ARCH_KEPLER30) || defined(KOKKOS_ARCH_KEPLER32)
     return m_ptr[i];
+#else
+    KOKKOS_IF_ON_DEVICE(
+        (AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
+         return *(reinterpret_cast<ValueType*>(&v));))
+    KOKKOS_IF_ON_HOST((return m_ptr[i];))
 #endif
   }
 
@@ -201,7 +201,7 @@ namespace Impl {
  */
 template <class Traits>
 class ViewDataHandle<
-    Traits, typename std::enable_if<(
+    Traits, std::enable_if_t<(
                 // Is Cuda memory space
                 (std::is_same<typename Traits::memory_space,
                               Kokkos::CudaSpace>::value ||
@@ -215,19 +215,18 @@ class ViewDataHandle<
                  sizeof(typename Traits::const_value_type) == 8 ||
                  sizeof(typename Traits::const_value_type) == 16) &&
                 // Random access trait
-                (Traits::memory_traits::is_random_access != 0))>::type> {
+                (Traits::memory_traits::is_random_access != 0))>> {
  public:
   using track_type = Kokkos::Impl::SharedAllocationTracker;
 
   using value_type  = typename Traits::const_value_type;
   using return_type = typename Traits::const_value_type;  // NOT a reference
 
-  using alias_type = typename std::conditional<
+  using alias_type = std::conditional_t<
       (sizeof(value_type) == 4), int,
-      typename std::conditional<
+      std::conditional_t<
           (sizeof(value_type) == 8), ::int2,
-          typename std::conditional<(sizeof(value_type) == 16), ::int4,
-                                    void>::type>::type>::type;
+          std::conditional_t<(sizeof(value_type) == 16), ::int4, void>>>;
 
 #if defined(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC)
   using handle_type = Kokkos::Impl::CudaLDGFetch<value_type, alias_type>;
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fc52e4151..fb3a6b138 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -63,16 +63,14 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
   FunctorType m_functor;
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_one(const std::int32_t w) const noexcept {
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_one(
+      const std::int32_t w) const noexcept {
     m_functor(w);
   }
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_one(const std::int32_t w) const noexcept {
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_one(
+      const std::int32_t w) const noexcept {
     const TagType t{};
     m_functor(t, w);
   }
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp
index 59aac2b52..dcc586372 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp
@@ -50,6 +50,18 @@
 
 #include <hip/hip_runtime.h>
 
+// FIXME_HIP ROCm 4.5 version header include would be <rocm/rocm_version.h>
+#if __has_include(<rocm_version.h>)
+#include <rocm_version.h>
+#define KOKKOS_IMPL_ROCM_VERSION \
+  ROCM_VERSION_MAJOR * 10000 + ROCM_VERSION_MINOR * 100 + ROCM_VERSION_PATCH
+#endif
+
+// FIXME_HIP workaround for ROCm version less than 5.0.2
+#if KOKKOS_IMPL_ROCM_VERSION < 50002
+#define KOKKOS_IMPL_HIP_ABORT_DOES_NOT_PRINT_MESSAGE
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -57,14 +69,8 @@ namespace Impl {
 // directive to the optimizer.
 [[noreturn]] __device__ __attribute__((noinline)) inline void hip_abort(
     char const *msg) {
-#ifdef NDEBUG
-  (void)msg;
-#else
-  // disable printf on release builds, as it has a non-trivial performance
-  // impact
-  printf("Aborting with message `%s'.\n", msg);
-#endif
-  abort();
+  const char empty[] = "";
+  __assert_fail(msg, empty, 0, empty);
   // This loop is never executed. It's intended to suppress warnings that the
   // function returns, even though it does not. This is necessary because
   // abort() is not marked as [[noreturn]], even though it does not return.
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp
index 263ba97d7..88bcab626 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp
@@ -80,9 +80,9 @@ __inline__ __device__ float atomic_exchange(volatile float *const dest,
 }
 
 template <typename T>
-__inline__ __device__ T atomic_exchange(
-    volatile T *const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T &>::type val) {
+__inline__ __device__ T
+atomic_exchange(volatile T *const dest,
+                std::enable_if_t<sizeof(T) == sizeof(int), const T &> val) {
   int tmp = atomicExch(reinterpret_cast<int *>(const_cast<T *>(dest)),
                        *reinterpret_cast<int *>(const_cast<T *>(&val)));
   return reinterpret_cast<T &>(tmp);
@@ -91,9 +91,10 @@ __inline__ __device__ T atomic_exchange(
 template <typename T>
 __inline__ __device__ T atomic_exchange(
     volatile T *const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T &>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T &>
+        val) {
   using type = unsigned long long int;
 
   type tmp = atomicExch(reinterpret_cast<type *>(const_cast<T *>(dest)),
@@ -102,11 +103,10 @@ __inline__ __device__ T atomic_exchange(
 }
 
 template <typename T>
-__inline__ __device__ T
-atomic_exchange(volatile T *const dest,
-                typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                            sizeof(T) != sizeof(long long),
-                                        const T>::type &val) {
+__inline__ __device__ T atomic_exchange(
+    volatile T *const dest,
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
+                     const T> &val) {
   T return_val;
   int done                 = 0;
   unsigned int active      = __ballot(1);
@@ -130,7 +130,7 @@ atomic_exchange(volatile T *const dest,
 template <typename T>
 __inline__ __device__ void atomic_assign(
     volatile T *const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T &>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T &> val) {
   atomicExch(reinterpret_cast<int *>(const_cast<T *>(dest)),
              *reinterpret_cast<int *>(const_cast<T *>(&val)));
 }
@@ -138,9 +138,10 @@ __inline__ __device__ void atomic_assign(
 template <typename T>
 __inline__ __device__ void atomic_assign(
     volatile T *const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T &>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T &>
+        val) {
   using type = unsigned long long int;
   atomicExch(reinterpret_cast<type *>(const_cast<T *>(dest)),
              *reinterpret_cast<type *>(const_cast<T *>(&val)));
@@ -149,9 +150,10 @@ __inline__ __device__ void atomic_assign(
 template <typename T>
 __inline__ __device__ void atomic_assign(
     volatile T *const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) != sizeof(unsigned long long int),
-                            const T &>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) != sizeof(unsigned long long int),
+                     const T &>
+        val) {
   atomic_exchange(dest, val);
 }
 
@@ -177,7 +179,7 @@ inline __device__ unsigned long long int atomic_compare_exchange(
 template <class T>
 __inline__ __device__ T atomic_compare_exchange(
     volatile T *dest, T compare,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T &>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T &> val) {
   // FIXME_HIP UB
   union U {
     int i;
@@ -194,8 +196,8 @@ __inline__ __device__ T atomic_compare_exchange(
 template <class T>
 __inline__ __device__ T atomic_compare_exchange(
     volatile T *dest, T compare,
-    typename std::enable_if<sizeof(T) == sizeof(unsigned long long int),
-                            const T &>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(unsigned long long int), const T &>
+        val) {
   // FIXME_HIP UB
   union U {
     unsigned long long int i;
@@ -213,9 +215,8 @@ __inline__ __device__ T atomic_compare_exchange(
 template <typename T>
 __inline__ __device__ T atomic_compare_exchange(
     volatile T *const dest, const T &compare,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) != sizeof(long long),
-                            const T>::type &val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
+                     const T> &val) {
   T return_val;
   int done                 = 0;
   unsigned int active      = __ballot(1);
@@ -256,9 +257,9 @@ inline __device__ float atomic_fetch_add(volatile float *dest,
 }
 
 template <typename T>
-inline __device__ T atomic_fetch_add(
-    volatile T *const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+inline __device__ T
+atomic_fetch_add(volatile T *const dest,
+                 std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
   // FIXME_HIP UB
   union U {
     int i;
@@ -281,8 +282,7 @@ inline __device__ T atomic_fetch_add(
 template <typename T>
 inline __device__ T atomic_fetch_add(
     volatile T *const dest,
-    typename std::enable_if<sizeof(T) == sizeof(long long), const T>::type
-        val) {
+    std::enable_if_t<sizeof(T) == sizeof(long long), const T> val) {
   // FIXME_HIP UB
   union U {
     unsigned long long i;
@@ -343,11 +343,11 @@ __inline__ __device__ long long atomic_fetch_add(volatile long long *dest,
 }
 
 template <class T>
-__inline__ __device__ T
-atomic_fetch_add(volatile T *dest,
-                 typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                             sizeof(T) != sizeof(long long),
-                                         const T &>::type val) {
+__inline__ __device__ T atomic_fetch_add(
+    volatile T *dest,
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
+                     const T &>
+        val) {
   T return_val;
   int done                 = 0;
   unsigned int active      = __ballot(1);
@@ -424,8 +424,7 @@ __inline__ __device__ long long atomic_fetch_sub(volatile long long *dest,
 
 template <class T>
 __inline__ __device__ T atomic_fetch_sub(
-    volatile T *dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), T>::type val) {
+    volatile T *dest, std::enable_if_t<sizeof(T) == sizeof(int), T> val) {
   // FIXME_HIP UB
   union U {
     int i;
@@ -448,8 +447,7 @@ __inline__ __device__ T atomic_fetch_sub(
 template <typename T>
 inline __device__ T atomic_fetch_sub(
     volatile T *const dest,
-    typename std::enable_if<sizeof(T) == sizeof(long long), const T>::type
-        val) {
+    std::enable_if_t<sizeof(T) == sizeof(long long), const T> val) {
   // FIXME_HIP UB
   union U {
     unsigned long long i;
@@ -472,8 +470,7 @@ inline __device__ T atomic_fetch_sub(
 
 template <class T>
 __inline__ __device__ T atomic_fetch_sub(
-    volatile T *dest,
-    typename std::enable_if<sizeof(T) == sizeof(char), T>::type val) {
+    volatile T *dest, std::enable_if_t<sizeof(T) == sizeof(char), T> val) {
   unsigned int oldval, newval, assume;
   oldval = *reinterpret_cast<volatile unsigned int *>(dest);
 
@@ -488,8 +485,7 @@ __inline__ __device__ T atomic_fetch_sub(
 
 template <class T>
 __inline__ __device__ T atomic_fetch_sub(
-    volatile T *dest,
-    typename std::enable_if<sizeof(T) == sizeof(short), T>::type val) {
+    volatile T *dest, std::enable_if_t<sizeof(T) == sizeof(short), T> val) {
   unsigned int oldval, newval, assume;
   oldval = *reinterpret_cast<int *>(dest);
 
@@ -503,11 +499,10 @@ __inline__ __device__ T atomic_fetch_sub(
 }
 
 template <typename T>
-__inline__ __device__ T
-atomic_fetch_sub(volatile T *const dest,
-                 typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                             sizeof(T) != sizeof(long long),
-                                         const T>::type &val) {
+__inline__ __device__ T atomic_fetch_sub(
+    volatile T *const dest,
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long long),
+                     const T> &val) {
   T return_val;
   int done                 = 0;
   unsigned int active      = __ballot(1);
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
index 10d9bc015..87551ae50 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
@@ -249,12 +249,11 @@ unsigned hip_get_preferred_team_blocksize(HIPInternal const *hip_instance,
       get_hip_func_attributes_impl<DriverType, LaunchBounds,
                                    BlockType::Preferred>();
   // get preferred blocksize limited by register usage
-  using namespace std::placeholders;
   const unsigned tperb_reg =
       hip_get_preferred_blocksize<DriverType, LaunchBounds>();
   return hip_internal_get_block_size<BlockType::Preferred, DriverType,
                                      LaunchBounds>(
-      hip_instance, std::bind(f, attr, _1), tperb_reg);
+      hip_instance, std::bind(f, attr, std::placeholders::_1), tperb_reg);
 }
 
 // Standardized blocksize deduction for non-teams parallel constructs with LDS
@@ -291,10 +290,9 @@ unsigned hip_get_max_team_blocksize(HIPInternal const *hip_instance,
   hipFuncAttributes attr =
       get_hip_func_attributes_impl<DriverType, LaunchBounds, BlockType::Max>();
   // get max blocksize
-  using namespace std::placeholders;
   const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
   return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
-      hip_instance, std::bind(f, attr, _1), tperb_reg);
+      hip_instance, std::bind(f, attr, std::placeholders::_1), tperb_reg);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index a8a0496af..3785cfe80 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -45,6 +45,10 @@
 /*--------------------------------------------------------------------------*/
 /* Kokkos interfaces */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
 
 #include <HIP/Kokkos_HIP_Instance.hpp>
@@ -175,8 +179,9 @@ HIPInternal::~HIPInternal() {
 
 int HIPInternal::verify_is_initialized(const char *const label) const {
   if (m_hipDev < 0) {
-    std::cerr << "Kokkos::Experimental::HIP::" << label
-              << " : ERROR device not initialized" << std::endl;
+    Kokkos::abort((std::string("Kokkos::Experimental::HIP::") + label +
+                   " : ERROR device not initialized\n")
+                      .c_str());
   }
   return 0 <= m_hipDev;
 }
@@ -421,10 +426,13 @@ void HIPInternal::finalize() {
   this->fence("Kokkos::HIPInternal::finalize: fence on finalization");
   was_finalized = true;
 
-  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
-    if (this == &singleton())
-      (void)Kokkos::Impl::hip_global_unique_token_locks(true);
+  if (this == &singleton()) {
+    (void)Kokkos::Impl::hip_global_unique_token_locks(true);
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
+  }
 
+  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
     using RecordHIP =
         Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace>;
 
@@ -436,35 +444,30 @@ void HIPInternal::finalize() {
 
     if (m_manage_stream && m_stream != nullptr)
       KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream));
+  }
 
-    m_hipDev                    = -1;
-    m_hipArch                   = -1;
-    m_multiProcCount            = 0;
-    m_maxWarpCount              = 0;
-    m_maxBlock                  = {0, 0, 0};
-    m_maxSharedWords            = 0;
-    m_maxShmemPerBlock          = 0;
-    m_scratchSpaceCount         = 0;
-    m_scratchFlagsCount         = 0;
-    m_scratchSpace              = nullptr;
-    m_scratchFlags              = nullptr;
-    m_stream                    = nullptr;
-    m_team_scratch_current_size = 0;
-    m_team_scratch_ptr          = nullptr;
+  m_hipDev                    = -1;
+  m_hipArch                   = -1;
+  m_multiProcCount            = 0;
+  m_maxWarpCount              = 0;
+  m_maxBlock                  = {0, 0, 0};
+  m_maxSharedWords            = 0;
+  m_maxShmemPerBlock          = 0;
+  m_scratchSpaceCount         = 0;
+  m_scratchFlagsCount         = 0;
+  m_scratchSpace              = nullptr;
+  m_scratchFlags              = nullptr;
+  m_stream                    = nullptr;
+  m_team_scratch_current_size = 0;
+  m_team_scratch_ptr          = nullptr;
+
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(m_scratch_locks));
+  m_scratch_locks = nullptr;
 
-    KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(m_scratch_locks));
-    m_scratch_locks = nullptr;
-  }
   if (nullptr != d_driverWorkArray) {
     KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
     d_driverWorkArray = nullptr;
   }
-
-  // only destroy these if we're finalizing the singleton
-  if (this == &singleton()) {
-    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
-    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
-  }
 }
 
 char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
index e9cfbf99f..f1ffaf375 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <HIP/Kokkos_HIP_Locks.hpp>
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
index 24b05f293..212bbb9ec 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -226,15 +226,14 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                                   WorkTag, void>::type;
 
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
+                                    ReducerTypeFwd>;
 
  public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using value_type     = typename ValueTraits::value_type;
-  using reference_type = typename ValueTraits::reference_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Experimental::HIP::size_type;
 
@@ -261,17 +260,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   }
 
   inline __device__ void operator()() const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
                                                    sizeof(size_type)>
-        word_count(ValueTraits::value_size(
+        word_count(Analysis::value_size(
                        ReducerConditional::select(m_functor, m_reducer)) /
                    sizeof(size_type));
 
     {
-      reference_type value = ValueInit::init(
-          ReducerConditional::select(m_functor, m_reducer),
+      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
           Experimental::kokkos_impl_hip_shared_memory<size_type>() +
-              threadIdx.y * word_count.value);
+          threadIdx.y * word_count.value));
 
       // Number of blocks is bounded so that the reduction can be limited to two
       // passes. Each thread block is given an approximately equal amount of
@@ -284,10 +285,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
     // Reduce with final value at blockDim.y - 1 location.
     // Problem: non power-of-two blockDim
-    if (::Kokkos::Impl::hip_single_inter_block_reduce_scan<
-            false, ReducerTypeFwd, WorkTagFwd>(
-            ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-            gridDim.x, Experimental::kokkos_impl_hip_shared_memory<size_type>(),
+    if (::Kokkos::Impl::hip_single_inter_block_reduce_scan<false>(
+            final_reducer, blockIdx.x, gridDim.x,
+            Experimental::kokkos_impl_hip_shared_memory<size_type>(),
             m_scratch_space, m_scratch_flags)) {
       // This is the final block with the final result at the final threads'
       // location
@@ -299,8 +299,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                                     : m_scratch_space;
 
       if (threadIdx.y == 0) {
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), shared);
+        final_reducer.final(reinterpret_cast<value_type*>(shared));
       }
 
       if (Experimental::Impl::HIPTraits::WarpSize < word_count.value) {
@@ -337,6 +336,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   }
 
   inline void execute() {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
     using ClosureType = ParallelReduce<FunctorType, Policy, ReducerType,
                                        Kokkos::Experimental::HIP>;
     const auto nwork  = m_policy.m_num_tiles;
@@ -356,7 +358,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       m_scratch_space =
           ::Kokkos::Experimental::Impl::hip_internal_scratch_space(
               m_policy.space(),
-              ValueTraits::value_size(
+              Analysis::value_size(
                   ReducerConditional::select(m_functor, m_reducer)) *
                   block_size /* block_size == max block_count */);
       m_scratch_flags =
@@ -380,31 +382,24 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
-      if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence(
-            "Kokkos::Impl::ParallelReduce<MDRangePolicy,HIP>: fence because "
-            "reduction can't access result storage location");
-
-        if (m_result_ptr) {
-          const int size = ValueTraits::value_size(
-              ReducerConditional::select(m_functor, m_reducer));
-          DeepCopy<HostSpace, Experimental::HIPSpace>(m_result_ptr,
-                                                      m_scratch_space, size);
-        }
+      if (!m_result_ptr_device_accessible && m_result_ptr) {
+        const int size = Analysis::value_size(
+            ReducerConditional::select(m_functor, m_reducer));
+        DeepCopy<HostSpace, Experimental::HIPSpace, Experimental::HIP>(
+            m_policy.space(), m_result_ptr, m_scratch_space, size);
       }
     } else {
       if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
+        final_reducer.init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ViewType& arg_result,
-                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
-                                         void*>::type = nullptr)
+  ParallelReduce(
+      const FunctorType& arg_functor, const Policy& arg_policy,
+      const ViewType& arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
index 14a282cc3..5c871e0d6 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
@@ -76,16 +76,14 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelFor& operator=(const ParallelFor&) = delete;
 
   template <class TagType>
-  inline __device__
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const Member i) const {
+  inline __device__ std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const Member i) const {
     m_functor(i);
   }
 
   template <class TagType>
-  inline __device__
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const Member i) const {
+  inline __device__ std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const Member i) const {
     m_functor(TagType(), i);
   }
 
@@ -154,15 +152,14 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                                   WorkTag, void>::type;
 
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+  using Analysis =
+      Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy,
+                                    ReducerTypeFwd>;
 
  public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using value_type     = typename ValueTraits::value_type;
-  using reference_type = typename ValueTraits::reference_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Kokkos::Experimental::HIP::size_type;
   using index_type     = typename Policy::index_type;
@@ -183,7 +180,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   std::lock_guard<std::mutex> m_shared_memory_lock;
 
   static bool constexpr UseShflReduction =
-      static_cast<bool>(ValueTraits::StaticValueSize);
+      static_cast<bool>(Analysis::StaticValueSize);
 
  private:
   struct ShflReductionTag {};
@@ -191,39 +188,37 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   // Make the exec_range calls call to Reduce::DeviceIterateTile
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update) const {
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update) const {
     m_functor(i, update);
   }
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update) const {
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update) const {
     m_functor(TagType(), i, update);
   }
 
  public:
   __device__ inline void operator()() const {
-    using ReductionTag =
-        typename std::conditional<UseShflReduction, ShflReductionTag,
-                                  SHMEMReductionTag>::type;
+    using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
+                                            SHMEMReductionTag>;
     run(ReductionTag{});
   }
 
   __device__ inline void run(SHMEMReductionTag) const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
                                                    sizeof(size_type)>
-        word_count(ValueTraits::value_size(
+        word_count(Analysis::value_size(
                        ReducerConditional::select(m_functor, m_reducer)) /
                    sizeof(size_type));
 
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
     {
-      reference_type value = ValueInit::init(
-          ReducerConditional::select(m_functor, m_reducer),
+      reference_type value = final_reducer.init(reinterpret_cast<pointer_type>(
           ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
-              threadIdx.y * word_count.value);
+          threadIdx.y * word_count.value));
 
       // Number of blocks is bounded so that the reduction can be limited to two
       // passes. Each thread block is given an approximately equal amount of
@@ -243,10 +238,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     // Shortcut for length zero reduction
     bool do_final_reduction = m_policy.begin() == m_policy.end();
     if (!do_final_reduction)
-      do_final_reduction = hip_single_inter_block_reduce_scan<
-          false, ReducerTypeFwd, WorkTagFwd>(
-          ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-          gridDim.x,
+      do_final_reduction = hip_single_inter_block_reduce_scan<false>(
+          final_reducer, blockIdx.x, gridDim.x,
           ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
           m_scratch_space, m_scratch_flags);
     if (do_final_reduction) {
@@ -261,8 +254,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                     : m_scratch_space;
 
       if (threadIdx.y == 0) {
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), shared);
+        final_reducer.final(reinterpret_cast<value_type*>(shared));
       }
 
       if (::Kokkos::Experimental::Impl::HIPTraits::WarpSize <
@@ -277,8 +269,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   }
 
   __device__ inline void run(ShflReductionTag) const {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
     value_type value;
-    ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &value);
+    final_reducer.init(&value);
     // Number of blocks is bounded so that the reduction can be limited to two
     // passes. Each thread block is given an approximately equal amount of work
     // to perform. Accumulate the values for this block. The accumulation
@@ -302,25 +297,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         (max_active_thread == 0) ? blockDim.y : max_active_thread;
 
     value_type init;
-    ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &init);
+    final_reducer.init(&init);
     if (m_policy.begin() == m_policy.end()) {
-      Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-          ReducerConditional::select(m_functor, m_reducer),
-          reinterpret_cast<void*>(&value));
+      final_reducer.final(&value);
       pointer_type const final_result =
           m_result_ptr_device_accessible ? m_result_ptr : result;
       *final_result = value;
-    } else if (Impl::hip_inter_block_shuffle_reduction<ReducerTypeFwd,
-                                                       ValueJoin, WorkTagFwd>(
-                   value, init,
-                   ValueJoin(ReducerConditional::select(m_functor, m_reducer)),
-                   m_scratch_space, result, m_scratch_flags,
-                   max_active_thread)) {
+    } else if (Impl::hip_inter_block_shuffle_reduction<>(
+                   value, init, final_reducer, m_scratch_space, result,
+                   m_scratch_flags, max_active_thread)) {
       unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x;
       if (id == 0) {
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer),
-            reinterpret_cast<void*>(&value));
+        final_reducer.final(&value);
         pointer_type const final_result =
             m_result_ptr_device_accessible ? m_result_ptr : result;
         *final_result = value;
@@ -342,9 +330,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   }
 
   inline void execute() {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
     const index_type nwork     = m_policy.end() - m_policy.begin();
-    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
-                                 ReduceFunctorHasFinal<FunctorType>::value ||
+    const bool need_device_set = Analysis::has_init_member_function ||
+                                 Analysis::has_final_member_function ||
                                  !m_result_ptr_host_accessible ||
                                  !std::is_same<ReducerType, InvalidType>::value;
     if ((nwork > 0) || need_device_set) {
@@ -358,7 +349,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       m_scratch_space =
           ::Kokkos::Experimental::Impl::hip_internal_scratch_space(
               m_policy.space(),
-              ValueTraits::value_size(
+              Analysis::value_size(
                   ReducerConditional::select(m_functor, m_reducer)) *
                   block_size /* block_size == max block_count */);
       m_scratch_flags =
@@ -390,31 +381,25 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
-      if (!m_result_ptr_device_accessible) {
-        m_policy.space().impl_internal_space_instance()->fence(
-            "Kokkos::Impl::ParallelReduce<RangePolicy,HIP>: fence because "
-            "reduction can't access result storage location");
-
-        if (m_result_ptr) {
-          const int size = ValueTraits::value_size(
-              ReducerConditional::select(m_functor, m_reducer));
-          DeepCopy<HostSpace, ::Kokkos::Experimental::HIPSpace>(
-              m_result_ptr, m_scratch_space, size);
-        }
+      if (!m_result_ptr_device_accessible && m_result_ptr) {
+        const int size = Analysis::value_size(
+            ReducerConditional::select(m_functor, m_reducer));
+        DeepCopy<HostSpace, ::Kokkos::Experimental::HIPSpace,
+                 ::Kokkos::Experimental::HIP>(m_policy.space(), m_result_ptr,
+                                              m_scratch_space, size);
       }
     } else {
       if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
+        final_reducer.init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
-                 const ViewType& arg_result,
-                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
-                                         void*>::type = nullptr)
+  ParallelReduce(
+      const FunctorType& arg_functor, const Policy& arg_policy,
+      const ViewType& arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
@@ -459,13 +444,12 @@ class ParallelScanHIPBase {
   using WorkRange    = typename Policy::WorkRange;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
-  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
+  using Analysis = Kokkos::Impl::FunctorAnalysis<FunctorPatternInterface::SCAN,
+                                                 Policy, FunctorType>;
 
  public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Kokkos::Experimental::HIP::size_type;
   using index_type     = typename Policy::index_type;
@@ -489,33 +473,31 @@ class ParallelScanHIPBase {
 
  private:
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update,
-                 const bool final_result) const {
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update, const bool final_result) const {
     m_functor(i, update, final_result);
   }
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const Member& i, reference_type update,
-                 const bool final_result) const {
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const Member& i, reference_type update, const bool final_result) const {
     m_functor(TagType(), i, update, final_result);
   }
 
   //----------------------------------------
 
   __device__ inline void initial() const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
                                                    sizeof(size_type)>
-        word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
+        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
 
-    size_type* const shared_value =
+    pointer_type const shared_value = reinterpret_cast<pointer_type>(
         Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
-        word_count.value * threadIdx.y;
+        word_count.value * threadIdx.y);
 
-    ValueInit::init(m_functor, shared_value);
+    final_reducer.init(shared_value);
 
     // Number of blocks is bounded so that the reduction can be limited to two
     // passes. Each thread block is given an approximately equal amount of work
@@ -527,15 +509,15 @@ class ParallelScanHIPBase {
     for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
          iwork < iwork_end; iwork += blockDim.y) {
       this->template exec_range<WorkTag>(
-          iwork, ValueOps::reference(shared_value), false);
+          iwork, final_reducer.reference(shared_value), false);
     }
 
     // Reduce and scan, writing out scan of blocks' totals and block-groups'
     // totals. Blocks' scan values are written to 'blockIdx.x' location.
     // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i <
     // gridDim.x
-    hip_single_inter_block_reduce_scan<true, FunctorType, WorkTag>(
-        m_functor, blockIdx.x, gridDim.x,
+    hip_single_inter_block_reduce_scan<true>(
+        final_reducer, blockIdx.x, gridDim.x,
         Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
         m_scratch_space, m_scratch_flags);
   }
@@ -543,9 +525,11 @@ class ParallelScanHIPBase {
   //----------------------------------------
 
   __device__ inline void final() const {
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
                                                    sizeof(size_type)>
-        word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
+        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
@@ -564,7 +548,7 @@ class ParallelScanHIPBase {
         shared_accum[i] = block_total[i];
       }
     } else if (0 == threadIdx.y) {
-      ValueInit::init(m_functor, shared_accum);
+      final_reducer.init(reinterpret_cast<pointer_type>(shared_accum));
     }
 
     const WorkRange range(m_policy, blockIdx.x, gridDim.x);
@@ -576,7 +560,8 @@ class ParallelScanHIPBase {
       __syncthreads();  // Don't overwrite previous iteration values until they
                         // are used
 
-      ValueInit::init(m_functor, shared_prefix + word_count.value);
+      final_reducer.init(
+          reinterpret_cast<pointer_type>(shared_prefix + word_count.value));
 
       // Copy previous block's accumulation total into thread[0] prefix and
       // inclusive scan value of this block
@@ -591,14 +576,16 @@ class ParallelScanHIPBase {
       const bool doWork = (iwork < range.end());
       if (doWork) {
         this->template exec_range<WorkTag>(
-            iwork, ValueOps::reference(shared_prefix + word_count.value),
+            iwork,
+            final_reducer.reference(reinterpret_cast<pointer_type>(
+                shared_prefix + word_count.value)),
             false);
       }
 
       // Scan block values into locations shared_data[1..blockDim.y]
-      hip_intra_block_reduce_scan<true, FunctorType, WorkTag>(
-          m_functor,
-          typename ValueTraits::pointer_type(shared_data + word_count.value));
+      hip_intra_block_reduce_scan<true>(
+          final_reducer,
+          typename Analysis::pointer_type(shared_data + word_count.value));
 
       {
         size_type* const block_total =
@@ -611,7 +598,10 @@ class ParallelScanHIPBase {
       // Call functor with exclusive scan value
       if (doWork) {
         this->template exec_range<WorkTag>(
-            iwork, ValueOps::reference(shared_prefix), true);
+            iwork,
+            final_reducer.reference(
+                reinterpret_cast<pointer_type>(shared_prefix)),
+            true);
       }
     }
   }
@@ -658,13 +648,13 @@ class ParallelScanHIPBase {
       m_grid_x = (nwork + work_per_block - 1) / work_per_block;
 
       m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
-          m_policy.space(), ValueTraits::value_size(m_functor) * m_grid_x);
+          m_policy.space(), Analysis::value_size(m_functor) * m_grid_x);
       m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
           m_policy.space(), sizeof(size_type) * 1);
 
       dim3 grid(m_grid_x, 1, 1);
       dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
+      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
 
       m_final = false;
       // these ones are OK to be just the base because the specializations
@@ -712,7 +702,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     const auto& instance =
         Base::m_policy.space().impl_internal_space_instance();
     auto shmem_functor = [&f](unsigned n) {
-      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+      return hip_single_inter_block_reduce_scan_shmem<true, FunctorType,
                                                       typename Base::WorkTag>(
           f, n);
     };
@@ -740,9 +730,10 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     const auto nwork = Base::m_policy.end() - Base::m_policy.begin();
     if (nwork) {
-      const int size = Base::ValueTraits::value_size(Base::m_functor);
-      DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
-          &m_returnvalue,
+      const int size = Base::Analysis::value_size(Base::m_functor);
+      DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
+               Kokkos::Experimental::HIP>(
+          Base::m_policy.space(), &m_returnvalue,
           Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int),
           size);
     }
@@ -760,7 +751,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     const auto& instance =
         Base::m_policy.space().impl_internal_space_instance();
     auto shmem_functor = [&f](unsigned n) {
-      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+      return hip_single_inter_block_reduce_scan_shmem<true, FunctorType,
                                                       typename Base::WorkTag>(
           f, n);
     };
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
index 0ec0761f7..69ced48a9 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -75,8 +75,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   int m_league_size;
   int m_team_size;
   int m_vector_length;
-  int m_team_scratch_size[2];
-  int m_thread_scratch_size[2];
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
   int m_chunk_size;
   bool m_tune_team_size;
   bool m_tune_vector_length;
@@ -206,15 +206,17 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
 
   int league_size() const { return m_league_size; }
 
-  int scratch_size(int level, int team_size_ = -1) const {
+  size_t scratch_size(int level, int team_size_ = -1) const {
     if (team_size_ < 0) team_size_ = m_team_size;
     return m_team_scratch_size[level] +
            team_size_ * m_thread_scratch_size[level];
   }
 
-  int team_scratch_size(int level) const { return m_team_scratch_size[level]; }
+  size_t team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
 
-  int thread_scratch_size(int level) const {
+  size_t thread_scratch_size(int level) const {
     return m_thread_scratch_size[level];
   }
 
@@ -359,7 +361,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     // internal_team_size_common_reduce
     //            once we can turn c++17 constexpr on by default.
     //            The problem right now is that we can't turn off the evaluation
-    //            of the functor_value_traits's valuesize / StaticValueSize
+    //            of the Analysis' valuesize / StaticValueSize
 
     const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
     const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double);
@@ -395,14 +397,16 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
 
   template <BlockType BlockSize, class ClosureType, class FunctorType>
   int internal_team_size_common_reduce(const FunctorType& f) const {
-    using functor_value_traits =
-        Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;
-
-    const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
-    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double) +
-                                  ((functor_value_traits::StaticValueSize != 0)
-                                       ? 0
-                                       : functor_value_traits::value_size(f));
+    using Interface =
+        typename Impl::DeduceFunctorPatternInterface<ClosureType>::type;
+    using Analysis =
+        Impl::FunctorAnalysis<Interface, typename ClosureType::Policy,
+                              FunctorType>;
+
+    const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double);
+    const unsigned shmem_thread =
+        thread_scratch_size(0) + sizeof(double) +
+        ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f));
     const int vector_length = impl_vector_length();
 
     const auto functor = [&f, shmem_block, shmem_thread, vector_length](
@@ -455,10 +459,10 @@ __device__ inline int64_t hip_get_scratch_index(
   int64_t threadid = 0;
   __shared__ int64_t base_thread_id;
   if (threadIdx.x == 0 && threadIdx.y == 0) {
-    int64_t const wraparound_len = Kokkos::Experimental::min(
-        int64_t(league_size),
-        (int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) /
-            (blockDim.x * blockDim.y));
+    int64_t const wraparound_len =
+        Kokkos::min(int64_t(league_size),
+                    (int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) /
+                        (blockDim.x * blockDim.y));
     threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
     threadid *= blockDim.x * blockDim.y;
     int done = 0;
@@ -513,23 +517,21 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_shmem_begin;
   int m_shmem_size;
   void* m_scratch_ptr[2];
-  int m_scratch_size[2];
+  size_t m_scratch_size[2];
   int32_t* m_scratch_locks;
   // Only let one ParallelFor/Reduce modify the team scratch memory. The
   // constructor acquires the mutex which is released in the destructor.
   std::lock_guard<std::mutex> m_scratch_lock_guard;
 
   template <typename TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_team(const member_type& member) const {
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
+      const member_type& member) const {
     m_functor(member);
   }
 
   template <typename TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_team(const member_type& member) const {
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team(
+      const member_type& member) const {
     m_functor(TagType(), member);
   }
 
@@ -647,22 +649,18 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                                   work_tag, void>::type;
 
-  using value_traits =
-      Kokkos::Impl::FunctorValueTraits<reducer_type_fwd, work_tag_fwd>;
-  using value_init =
-      Kokkos::Impl::FunctorValueInit<reducer_type_fwd, work_tag_fwd>;
-  using value_join =
-      Kokkos::Impl::FunctorValueJoin<reducer_type_fwd, work_tag_fwd>;
+  using analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         Policy, reducer_type_fwd>;
 
-  using pointer_type   = typename value_traits::pointer_type;
-  using reference_type = typename value_traits::reference_type;
-  using value_type     = typename value_traits::value_type;
+  using pointer_type   = typename analysis::pointer_type;
+  using reference_type = typename analysis::reference_type;
+  using value_type     = typename analysis::value_type;
 
  public:
   using functor_type = FunctorType;
   using size_type    = Kokkos::Experimental::HIP::size_type;
 
-  static int constexpr UseShflReduction = (value_traits::StaticValueSize != 0);
+  static int constexpr UseShflReduction = (analysis::StaticValueSize != 0);
 
  private:
   struct ShflReductionTag {};
@@ -688,7 +686,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_type m_shmem_begin;
   size_type m_shmem_size;
   void* m_scratch_ptr[2];
-  int m_scratch_size[2];
+  size_t m_scratch_size[2];
   int32_t* m_scratch_locks;
   const size_type m_league_size;
   int m_team_size;
@@ -698,16 +696,14 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   std::lock_guard<std::mutex> m_scratch_lock_guard;
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_team(member_type const& member, reference_type update) const {
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
+      member_type const& member, reference_type update) const {
     m_functor(member, update);
   }
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_team(member_type const& member, reference_type update) const {
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team(
+      member_type const& member, reference_type update) const {
     m_functor(TagType(), member, update);
   }
 
@@ -747,16 +743,18 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   __device__ inline void run(SHMEMReductionTag, int const threadid) const {
-    integral_nonzero_constant<size_type, value_traits::StaticValueSize /
+    typename analysis::Reducer final_reducer(
+        &reducer_conditional::select(m_functor, m_reducer));
+
+    integral_nonzero_constant<size_type, analysis::StaticValueSize /
                                              sizeof(size_type)> const
-        word_count(value_traits::value_size(
+        word_count(analysis::value_size(
                        reducer_conditional::select(m_functor, m_reducer)) /
                    sizeof(size_type));
 
-    reference_type value = value_init::init(
-        reducer_conditional::select(m_functor, m_reducer),
+    reference_type value = final_reducer.init(
         Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
-            threadIdx.y * word_count.value);
+        threadIdx.y * word_count.value);
 
     // Iterate this block through the league
     iterate_through_league(threadid, value);
@@ -782,8 +780,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                     : m_scratch_space;
 
       if (threadIdx.y == 0) {
-        Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
-            reducer_conditional::select(m_functor, m_reducer), shared);
+        final_reducer.final(reinterpret_cast<value_type*>(shared));
       }
 
       if (Kokkos::Experimental::Impl::HIPTraits::WarpSize < word_count.value) {
@@ -797,8 +794,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   __device__ inline void run(ShflReductionTag, int const threadid) const {
+    typename analysis::Reducer final_reducer(
+        &reducer_conditional::select(m_functor, m_reducer));
+
     value_type value;
-    value_init::init(reducer_conditional::select(m_functor, m_reducer), &value);
+    final_reducer.init(&value);
 
     // Iterate this block through the league
     iterate_through_league(threadid, value);
@@ -809,32 +809,28 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
             : reinterpret_cast<pointer_type>(m_scratch_space);
 
     value_type init;
-    value_init::init(reducer_conditional::select(m_functor, m_reducer), &init);
+    final_reducer.init(&init);
     if (m_league_size == 0) {
-      Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
-          reducer_conditional::select(m_functor, m_reducer),
-          reinterpret_cast<void*>(&value));
+      final_reducer.final(&value);
       *result = value;
-    } else if (Impl::hip_inter_block_shuffle_reduction<FunctorType, value_join,
-                                                       work_tag>(
-                   value, init,
-                   value_join(
-                       reducer_conditional::select(m_functor, m_reducer)),
-                   m_scratch_space, result, m_scratch_flags, blockDim.y)) {
+    } else if (Impl::hip_inter_block_shuffle_reduction(
+                   value, init, final_reducer, m_scratch_space, result,
+                   m_scratch_flags, blockDim.y)) {
       unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x;
       if (id == 0) {
-        Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
-            reducer_conditional::select(m_functor, m_reducer),
-            reinterpret_cast<void*>(&value));
+        final_reducer.final(&value);
         *result = value;
       }
     }
   }
 
   inline void execute() {
+    typename analysis::Reducer final_reducer(
+        &reducer_conditional::select(m_functor, m_reducer));
+
     const bool is_empty_range  = m_league_size == 0 || m_team_size == 0;
-    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
-                                 ReduceFunctorHasFinal<FunctorType>::value ||
+    const bool need_device_set = analysis::has_init_member_function ||
+                                 analysis::has_final_member_function ||
                                  !m_result_ptr_host_accessible ||
                                  !std::is_same<ReducerType, InvalidType>::value;
     if (!is_empty_range || need_device_set) {
@@ -847,10 +843,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               : std::min(static_cast<int>(m_league_size), m_team_size);
 
       m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
-          m_policy.space(),
-          value_traits::value_size(
-              reducer_conditional::select(m_functor, m_reducer)) *
-              block_count);
+          m_policy.space(), analysis::value_size(reducer_conditional::select(
+                                m_functor, m_reducer)) *
+                                block_count);
       m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
           m_policy.space(), sizeof(size_type));
 
@@ -875,7 +870,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy.space().impl_internal_space_instance()->fence();
 
         if (m_result_ptr) {
-          const int size = value_traits::value_size(
+          const int size = analysis::value_size(
               reducer_conditional::select(m_functor, m_reducer));
           DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
               m_result_ptr, m_scratch_space, size);
@@ -883,17 +878,16 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       }
     } else {
       if (m_result_ptr) {
-        value_init::init(reducer_conditional::select(m_functor, m_reducer),
-                         m_result_ptr);
+        final_reducer.init(m_result_ptr);
       }
     }
   }
 
   template <class ViewType>
-  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
-                 ViewType const& arg_result,
-                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
-                                         void*>::type = nullptr)
+  ParallelReduce(
+      FunctorType const& arg_functor, Policy const& arg_policy,
+      ViewType const& arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
index 7929e6df7..1091ad5ce 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
@@ -58,17 +58,13 @@ namespace Impl {
 // Reduction-only implementation
 //----------------------------------------------------------------------------
 
-template <class FunctorType, class ArgTag, bool UseShfl>
+template <class FunctorType, bool UseShfl>
 struct HIPReductionsFunctor;
 
-template <typename FunctorType, typename ArgTag>
-struct HIPReductionsFunctor<FunctorType, ArgTag, true> {
-  using ValueTraits  = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin    = FunctorValueJoin<FunctorType, ArgTag>;
-  using ValueInit    = FunctorValueInit<FunctorType, ArgTag>;
-  using ValueOps     = FunctorValueOps<FunctorType, ArgTag>;
-  using pointer_type = typename ValueTraits::pointer_type;
-  using Scalar       = typename ValueTraits::value_type;
+template <typename FunctorType>
+struct HIPReductionsFunctor<FunctorType, true> {
+  using pointer_type = typename FunctorType::pointer_type;
+  using Scalar       = typename FunctorType::value_type;
 
   __device__ static inline void scalar_intra_warp_reduction(
       FunctorType const& functor,
@@ -79,7 +75,7 @@ struct HIPReductionsFunctor<FunctorType, ArgTag, true> {
       Scalar& result) {
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
       Scalar tmp = Kokkos::Experimental::shfl_down(value, delta, width);
-      ValueJoin::join(functor, &value, &tmp);
+      functor.join(&value, &tmp);
     }
 
     Experimental::Impl::in_place_shfl(result, value, 0, width);
@@ -109,16 +105,16 @@ struct HIPReductionsFunctor<FunctorType, ArgTag, true> {
     for (int w = shared_elements; w < num_warps; w += shared_elements) {
       if (warp_id >= w && warp_id < w + shared_elements) {
         if ((threadIdx.y * blockDim.x + threadIdx.x) % warp_size == 0)
-          ValueJoin::join(functor, my_shared_team_buffer_element, &value);
+          functor.join(my_shared_team_buffer_element, &value);
       }
       __syncthreads();
     }
 
     if (warp_id == 0) {
-      ValueInit::init(functor, &value);
+      functor.init(&value);
       for (unsigned int i = threadIdx.y * blockDim.x + threadIdx.x;
            i < blockDim.y * blockDim.x / warp_size; i += warp_size) {
-        ValueJoin::join(functor, &value, &shared_team_buffer_element[i]);
+        functor.join(&value, &shared_team_buffer_element[i]);
       }
       scalar_intra_warp_reduction(functor, value, false, warp_size,
                                   *my_global_team_buffer_element);
@@ -163,10 +159,10 @@ struct HIPReductionsFunctor<FunctorType, ArgTag, true> {
     if (num_teams_done == gridDim.x) {
       is_last_block = true;
       *global_flags = 0;
-      ValueInit::init(functor, &value);
+      functor.init(&value);
       for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements;
            i += blockDim.x * blockDim.y) {
-        ValueJoin::join(functor, &value, &global_team_buffer_element[i]);
+        functor.join(&value, &global_team_buffer_element[i]);
       }
       scalar_intra_block_reduction(
           functor, value, false, shared_team_buffer_elements + blockDim.y - 1,
@@ -177,14 +173,10 @@ struct HIPReductionsFunctor<FunctorType, ArgTag, true> {
   }
 };
 
-template <typename FunctorType, typename ArgTag>
-struct HIPReductionsFunctor<FunctorType, ArgTag, false> {
-  using ValueTraits  = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin    = FunctorValueJoin<FunctorType, ArgTag>;
-  using ValueInit    = FunctorValueInit<FunctorType, ArgTag>;
-  using ValueOps     = FunctorValueOps<FunctorType, ArgTag>;
-  using pointer_type = typename ValueTraits::pointer_type;
-  using Scalar       = typename ValueTraits::value_type;
+template <typename FunctorType>
+struct HIPReductionsFunctor<FunctorType, false> {
+  using pointer_type = typename FunctorType::pointer_type;
+  using Scalar       = typename FunctorType::value_type;
 
   __device__ static inline void scalar_intra_warp_reduction(
       FunctorType const& functor,
@@ -197,7 +189,7 @@ struct HIPReductionsFunctor<FunctorType, ArgTag, false> {
                         ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
       if (lane_id + delta < ::Kokkos::Experimental::Impl::HIPTraits::WarpSize) {
-        ValueJoin::join(functor, value, value + delta);
+        functor.join(value, value + delta);
       }
     }
     *value = *(value - lane_id);
@@ -271,10 +263,10 @@ struct HIPReductionsFunctor<FunctorType, ArgTag, false> {
     if (num_teams_done == gridDim.x) {
       is_last_block = true;
       *global_flags = 0;
-      ValueInit::init(functor, &value);
+      functor.init(&value);
       for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements;
            i += blockDim.x * blockDim.y) {
-        ValueJoin::join(functor, &value, &global_team_buffer_element[i]);
+        functor.join(&value, &global_team_buffer_element[i]);
       }
       scalar_intra_block_reduction(
           functor, value, false, shared_team_buffer_elements + (blockDim.y - 1),
@@ -290,75 +282,103 @@ struct HIPReductionsFunctor<FunctorType, ArgTag, false> {
 //----------------------------------------------------------------------------
 /*
  *  Algorithmic constraints:
- *   (a) blockDim.y is a power of two
- *   (b) blockDim.y <= 1024
- *   (c) blockDim.x == blockDim.z == 1
+ *   (a) blockDim.y <= 1024
+ *   (b) blockDim.x == blockDim.z == 1
  */
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType>
 __device__ void hip_intra_block_reduce_scan(
     FunctorType const& functor,
-    typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type const
-        base_data) {
-  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
-
-  using pointer_type = typename ValueTraits::pointer_type;
-
-  unsigned int const value_count   = ValueTraits::value_count(functor);
-  unsigned int const BlockSizeMask = blockDim.y - 1;
-  int const WarpMask = Experimental::Impl::HIPTraits::WarpSize - 1;
-
-  // Must have power of two thread count
-  if ((blockDim.y - 1) & blockDim.y) {
-    Kokkos::abort(
-        "HIP::hip_intra_block_reduce_scan requires power-of-two "
-        "blockDim.y\n");
-  }
-
-  auto block_reduce_step =
-      [&functor, value_count](int const R, pointer_type const TD, int const S) {
-        if (R > ((1 << S) - 1)) {
-          ValueJoin::join(functor, TD, (TD - (value_count << S)));
-        }
-      };
+    typename FunctorType::pointer_type const base_data) {
+  using pointer_type = typename FunctorType::pointer_type;
+
+  const unsigned value_count = functor.length();
+  const unsigned not_less_power_of_two =
+      (1 << (Impl::int_log2(blockDim.y - 1) + 1));
+  const unsigned BlockSizeMask = not_less_power_of_two - 1;
+  // There is at most one warp that is neither completely full or empty.
+  // For that warp, we shift all indices logically to the end and ignore join
+  // operations with unassigned indices in the warp when performing the intra
+  // warp reduction/scan.
+  const bool is_full_warp =
+      (((threadIdx.y >> Experimental::Impl::HIPTraits::WarpIndexShift) + 1)
+       << Experimental::Impl::HIPTraits::WarpIndexShift) <= blockDim.y;
+
+  auto block_reduce_step = [&functor, value_count](
+                               int const R, pointer_type const TD, int const S,
+                               pointer_type memory_start, int index_shift) {
+    const auto join_ptr = TD - (value_count << S) + value_count * index_shift;
+    if (R > ((1 << S) - 1) && join_ptr >= memory_start) {
+      functor.join(TD, join_ptr);
+    }
+  };
 
-  {  // Intra-warp reduction:
-    const unsigned rtid_intra      = threadIdx.y & WarpMask;
+  // Intra-warp reduction:
+  {
+    const unsigned mapped_idx =
+        threadIdx.y + (is_full_warp
+                           ? 0
+                           : (not_less_power_of_two - blockDim.y) &
+                                 (Experimental::Impl::HIPTraits::WarpSize - 1));
     const pointer_type tdata_intra = base_data + value_count * threadIdx.y;
-
-    block_reduce_step(rtid_intra, tdata_intra, 0);
-    block_reduce_step(rtid_intra, tdata_intra, 1);
-    block_reduce_step(rtid_intra, tdata_intra, 2);
-    block_reduce_step(rtid_intra, tdata_intra, 3);
-    block_reduce_step(rtid_intra, tdata_intra, 4);
-    block_reduce_step(rtid_intra, tdata_intra, 5);
+    const pointer_type warp_start =
+        base_data +
+        value_count *
+            ((threadIdx.y >> Experimental::Impl::HIPTraits::WarpIndexShift)
+             << Experimental::Impl::HIPTraits::WarpIndexShift);
+    block_reduce_step(mapped_idx, tdata_intra, 0, warp_start, 0);
+    block_reduce_step(mapped_idx, tdata_intra, 1, warp_start, 0);
+    block_reduce_step(mapped_idx, tdata_intra, 2, warp_start, 0);
+    block_reduce_step(mapped_idx, tdata_intra, 3, warp_start, 0);
+    block_reduce_step(mapped_idx, tdata_intra, 4, warp_start, 0);
+    block_reduce_step(mapped_idx, tdata_intra, 5, warp_start, 0);
   }
 
   __syncthreads();  // Wait for all warps to reduce
 
-  {  // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
-    unsigned int const rtid_inter =
-        ((threadIdx.y + 1) << Experimental::Impl::HIPTraits::WarpIndexShift) -
-        1;
-
-    if (rtid_inter < blockDim.y) {
-      pointer_type const tdata_inter = base_data + value_count * rtid_inter;
+  // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
+  {
+    // There is at most one warp where the memory address to be used is not
+    // (HIPTraits::WarpSize - 1) away from the warp start adress. For the
+    // following reduction, we shift all indices logically to the end of the
+    // next power-of-two to the number of warps.
+    const unsigned n_active_warps =
+        ((blockDim.y - 1) >> Experimental::Impl::HIPTraits::WarpIndexShift) + 1;
+    if (threadIdx.y < n_active_warps) {
+      const bool is_full_warp_inter =
+          threadIdx.y <
+          (blockDim.y >> Experimental::Impl::HIPTraits::WarpIndexShift);
+      pointer_type const tdata_inter =
+          base_data +
+          value_count *
+              (is_full_warp_inter
+                   ? (threadIdx.y
+                      << Experimental::Impl::HIPTraits::WarpIndexShift) +
+                         (Experimental::Impl::HIPTraits::WarpSize - 1)
+                   : blockDim.y - 1);
+      const unsigned index_shift =
+          is_full_warp_inter
+              ? 0
+              : blockDim.y - (threadIdx.y
+                              << Experimental::Impl::HIPTraits::WarpIndexShift);
+      const int rtid_inter =
+          (threadIdx.y << Experimental::Impl::HIPTraits::WarpIndexShift) +
+          (Experimental::Impl::HIPTraits::WarpSize - 1) - index_shift;
 
       if ((1 << 6) < BlockSizeMask) {
-        block_reduce_step(rtid_inter, tdata_inter, 6);
+        block_reduce_step(rtid_inter, tdata_inter, 6, base_data, index_shift);
       }
       if ((1 << 7) < BlockSizeMask) {
-        block_reduce_step(rtid_inter, tdata_inter, 7);
+        block_reduce_step(rtid_inter, tdata_inter, 7, base_data, index_shift);
       }
       if ((1 << 8) < BlockSizeMask) {
-        block_reduce_step(rtid_inter, tdata_inter, 8);
+        block_reduce_step(rtid_inter, tdata_inter, 8, base_data, index_shift);
       }
       if ((1 << 9) < BlockSizeMask) {
-        block_reduce_step(rtid_inter, tdata_inter, 9);
+        block_reduce_step(rtid_inter, tdata_inter, 9, base_data, index_shift);
       }
       if ((1 << 10) < BlockSizeMask) {
-        block_reduce_step(rtid_inter, tdata_inter, 10);
+        block_reduce_step(rtid_inter, tdata_inter, 10, base_data, index_shift);
       }
     }
   }
@@ -368,12 +388,16 @@ __device__ void hip_intra_block_reduce_scan(
   if (DoScan) {
     // Update all the values for the respective warps (except for the last one)
     // by adding from the last value of the previous warp.
+    const unsigned int WarpMask = Experimental::Impl::HIPTraits::WarpSize - 1;
+    const int is_last_thread_in_warp =
+        is_full_warp ? ((threadIdx.y & WarpMask) ==
+                        Experimental::Impl::HIPTraits::WarpSize - 1)
+                     : (threadIdx.y == blockDim.y - 1);
     if (threadIdx.y >= Experimental::Impl::HIPTraits::WarpSize &&
-        (threadIdx.y & WarpMask) !=
-            Experimental::Impl::HIPTraits::WarpSize - 1) {
+        !is_last_thread_in_warp) {
       const int offset_to_previous_warp_total = (threadIdx.y & (~WarpMask)) - 1;
-      ValueJoin::join(functor, base_data + value_count * threadIdx.y,
-                      base_data + value_count * offset_to_previous_warp_total);
+      functor.join(base_data + value_count * threadIdx.y,
+                   base_data + value_count * offset_to_previous_warp_total);
     }
   }
 }
@@ -387,7 +411,7 @@ __device__ void hip_intra_block_reduce_scan(
  *  Global reduce result is in the last threads' 'shared_data' location.
  */
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType>
 __device__ bool hip_single_inter_block_reduce_scan_impl(
     FunctorType const& functor,
     ::Kokkos::Experimental::HIP::size_type const block_id,
@@ -395,13 +419,10 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
     ::Kokkos::Experimental::HIP::size_type* const shared_data,
     ::Kokkos::Experimental::HIP::size_type* const global_data,
     ::Kokkos::Experimental::HIP::size_type* const global_flags) {
-  using size_type   = ::Kokkos::Experimental::HIP::size_type;
-  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
-  using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
-  using ValueInit   = FunctorValueInit<FunctorType, ArgTag>;
-  using ValueOps    = FunctorValueOps<FunctorType, ArgTag>;
+  using size_type = ::Kokkos::Experimental::HIP::size_type;
 
-  using pointer_type = typename ValueTraits::pointer_type;
+  using value_type   = typename FunctorType::value_type;
+  using pointer_type = typename FunctorType::pointer_type;
 
   // '__ffs' = position of the least significant bit set to 1.
   // 'blockDim.y' is guaranteed to be a power of two so this
@@ -416,13 +437,14 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
         "blockDim");
   }
 
-  integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                           sizeof(size_type)> const
-      word_count(ValueTraits::value_size(functor) / sizeof(size_type));
+  const integral_nonzero_constant<
+      size_type, std::is_pointer<typename FunctorType::reference_type>::value
+                     ? 0
+                     : sizeof(value_type) / sizeof(size_type)>
+      word_count((sizeof(value_type) * functor.length()) / sizeof(size_type));
 
   // Reduce the accumulation for the entire block.
-  hip_intra_block_reduce_scan<false, FunctorType, ArgTag>(
-      functor, pointer_type(shared_data));
+  hip_intra_block_reduce_scan<false>(functor, pointer_type(shared_data));
 
   {
     // Write accumulation total to global scratch space.
@@ -461,32 +483,34 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
                         BlockSizeShift;
 
     {
-      void* const shared_ptr = shared_data + word_count.value * threadIdx.y;
-      /* reference_type shared_value = */ ValueInit::init(functor, shared_ptr);
+      pointer_type const shared_data_thread = reinterpret_cast<pointer_type>(
+          shared_data + word_count.value * threadIdx.y);
+      /* reference_type shared_value = */ functor.init(shared_data_thread);
 
       for (size_type i = b; i < e; ++i) {
-        ValueJoin::join(functor, shared_ptr,
-                        global_data + word_count.value * i);
+        functor.join(
+            shared_data_thread,
+            reinterpret_cast<pointer_type>(global_data + word_count.value * i));
       }
     }
 
-    hip_intra_block_reduce_scan<DoScan, FunctorType, ArgTag>(
-        functor, pointer_type(shared_data));
+    hip_intra_block_reduce_scan<DoScan>(functor, pointer_type(shared_data));
 
     if (DoScan) {
-      size_type* const shared_value =
+      pointer_type const shared_value = reinterpret_cast<pointer_type>(
           shared_data +
-          word_count.value * (threadIdx.y ? threadIdx.y - 1 : blockDim.y);
+          word_count.value * (threadIdx.y ? threadIdx.y - 1 : blockDim.y));
 
       if (!threadIdx.y) {
-        ValueInit::init(functor, shared_value);
+        functor.init(shared_value);
       }
 
       // Join previous inclusive scan value to each member
       for (size_type i = b; i < e; ++i) {
-        size_type* const global_value = global_data + word_count.value * i;
-        ValueJoin::join(functor, shared_value, global_value);
-        ValueOps::copy(functor, global_value, shared_value);
+        pointer_type const global_value =
+            reinterpret_cast<pointer_type>(global_data + word_count.value * i);
+        functor.join(shared_value, global_value);
+        functor.copy(global_value, shared_value);
       }
     }
   }
@@ -494,7 +518,7 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
   return is_last_block;
 }
 
-template <bool DoScan, typename FunctorType, typename ArgTag>
+template <bool DoScan, typename FunctorType>
 __device__ bool hip_single_inter_block_reduce_scan(
     FunctorType const& functor,
     ::Kokkos::Experimental::HIP::size_type const block_id,
@@ -502,29 +526,45 @@ __device__ bool hip_single_inter_block_reduce_scan(
     ::Kokkos::Experimental::HIP::size_type* const shared_data,
     ::Kokkos::Experimental::HIP::size_type* const global_data,
     ::Kokkos::Experimental::HIP::size_type* const global_flags) {
-  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
-  // If we are doing a reduction and StaticValueSize is true, we use the
+  // If we are doing a reduction and we don't do an array reduction, we use the
   // reduction-only path. Otherwise, we use the common path between reduction
   // and scan.
-  if (!DoScan && static_cast<bool>(ValueTraits::StaticValueSize))
+  if (!DoScan && !std::is_pointer<typename FunctorType::reference_type>::value)
     // FIXME_HIP_PERFORMANCE I don't know where 16 comes from. This inequality
     // determines if we use shared memory (false) or shuffle (true)
     return Kokkos::Impl::HIPReductionsFunctor<
-        FunctorType, ArgTag, (ValueTraits::StaticValueSize > 16)>::
-        scalar_inter_block_reduction(functor, block_count, shared_data,
-                                     global_data, global_flags);
+        FunctorType, (sizeof(typename FunctorType::value_type) >
+                      16)>::scalar_inter_block_reduction(functor, block_count,
+                                                         shared_data,
+                                                         global_data,
+                                                         global_flags);
   else {
-    return hip_single_inter_block_reduce_scan_impl<DoScan, FunctorType, ArgTag>(
+    return hip_single_inter_block_reduce_scan_impl<DoScan>(
         functor, block_id, block_count, shared_data, global_data, global_flags);
   }
 }
 
 // Size in bytes required for inter block reduce or scan
 template <bool DoScan, class FunctorType, class ArgTag>
-inline unsigned hip_single_inter_block_reduce_scan_shmem(
-    const FunctorType& functor, const unsigned BlockSize) {
-  return (BlockSize + 2) *
-         Impl::FunctorValueTraits<FunctorType, ArgTag>::value_size(functor);
+inline std::enable_if_t<DoScan, unsigned>
+hip_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
+                                         const unsigned BlockSize) {
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         RangePolicy<Experimental::HIP, ArgTag>,
+                                         FunctorType>;
+
+  return (BlockSize + 2) * Analysis::value_size(functor);
+}
+
+template <bool DoScan, class FunctorType, class ArgTag>
+inline std::enable_if_t<!DoScan, unsigned>
+hip_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
+                                         const unsigned BlockSize) {
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         RangePolicy<Experimental::HIP, ArgTag>,
+                                         FunctorType>;
+
+  return (BlockSize + 2) * Analysis::value_size(functor);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
index 5a27e2e0d..eb85ed470 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
@@ -61,11 +61,9 @@ namespace Impl {
  *   (b) blockDim.x == power of two
  *   (x) blockDim.z == 1
  */
-template <typename ValueType, typename JoinOp,
-          typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
-                                  int>::type = 0>
+template <typename ValueType, typename ReducerType>
 __device__ inline void hip_intra_warp_shuffle_reduction(
-    ValueType& result, JoinOp const& join,
+    ValueType& result, ReducerType const& reducer,
     uint32_t const max_active_thread = blockDim.y) {
   unsigned int shift = 1;
 
@@ -78,7 +76,7 @@ __device__ inline void hip_intra_warp_shuffle_reduction(
     // Only join if upper thread is active (this allows non power of two for
     // blockDim.y)
     if (threadIdx.y + shift < max_active_thread) {
-      join(result, tmp);
+      reducer.join(&result, &tmp);
     }
     shift *= 2;
   }
@@ -87,11 +85,9 @@ __device__ inline void hip_intra_warp_shuffle_reduction(
   result = Kokkos::Experimental::shfl(result, 0, warp_size);
 }
 
-template <typename ValueType, typename JoinOp,
-          typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
-                                  int>::type = 0>
+template <typename ValueType, typename ReducerType>
 __device__ inline void hip_inter_warp_shuffle_reduction(
-    ValueType& value, const JoinOp& join,
+    ValueType& value, const ReducerType& reducer,
     const int max_active_thread = blockDim.y) {
   unsigned int constexpr warp_size =
       Kokkos::Experimental::Impl::HIPTraits::WarpSize;
@@ -111,7 +107,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
   __syncthreads();
   while (shift <= max_active_thread / step) {
     if (shift <= id && shift + step_width > id && threadIdx.x == 0) {
-      join(result[id % step_width], value);
+      reducer.join(&result[id % step_width], &value);
     }
     __syncthreads();
     shift += step_width;
@@ -119,37 +115,31 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
 
   value = result[0];
   for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
-    join(value, result[i]);
+    reducer.join(&value, &result[i]);
 }
 
-template <typename ValueType, typename JoinOp,
-          typename std::enable_if<!Kokkos::is_reducer<ValueType>::value,
-                                  int>::type = 0>
+template <typename ValueType, typename ReducerType>
 __device__ inline void hip_intra_block_shuffle_reduction(
-    ValueType& value, JoinOp const& join,
+    ValueType& value, ReducerType const& reducer,
     int const max_active_thread = blockDim.y) {
-  hip_intra_warp_shuffle_reduction(value, join, max_active_thread);
-  hip_inter_warp_shuffle_reduction(value, join, max_active_thread);
+  hip_intra_warp_shuffle_reduction(value, reducer, max_active_thread);
+  hip_inter_warp_shuffle_reduction(value, reducer, max_active_thread);
 }
 
-template <class FunctorType, class JoinOp, class ArgTag = void>
+template <class FunctorType>
 __device__ inline bool hip_inter_block_shuffle_reduction(
-    typename FunctorValueTraits<FunctorType, ArgTag>::reference_type value,
-    typename FunctorValueTraits<FunctorType, ArgTag>::reference_type neutral,
-    JoinOp const& join,
+    typename FunctorType::reference_type value,
+    typename FunctorType::reference_type neutral, FunctorType const& reducer,
     Kokkos::Experimental::HIP::size_type* const m_scratch_space,
-    typename FunctorValueTraits<FunctorType,
-                                ArgTag>::pointer_type const /*result*/,
+    typename FunctorType::pointer_type const /*result*/,
     Kokkos::Experimental::HIP::size_type* const m_scratch_flags,
     int const max_active_thread = blockDim.y) {
-  using pointer_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type;
-  using value_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;
+  using pointer_type = typename FunctorType::pointer_type;
+  using value_type   = typename FunctorType::value_type;
 
   // Do the intra-block reduction with shfl operations for the intra warp
   // reduction and static shared memory for the inter warp reduction
-  hip_intra_block_shuffle_reduction(value, join, max_active_thread);
+  hip_intra_block_shuffle_reduction(value, reducer, max_active_thread);
 
   int const id = threadIdx.y * blockDim.x + threadIdx.x;
 
@@ -188,7 +178,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction(
                                 : warp_size;
       for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) {
         value_type tmp = global[i];
-        join(value, tmp);
+        reducer.join(&value, &tmp);
       }
 
       // Perform shfl reductions within the warp only join if contribution is
@@ -196,7 +186,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction(
       for (unsigned int i = 1; i < warp_size; i *= 2) {
         if ((blockDim.x * blockDim.y) > i) {
           value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size);
-          if (id + i < gridDim.x) join(value, tmp);
+          if (id + i < gridDim.x) reducer.join(&value, &tmp);
         }
       }
     }
@@ -205,130 +195,6 @@ __device__ inline bool hip_inter_block_shuffle_reduction(
   // "value"
   return last_block;
 }
-
-// We implemente the same functions as above but the user provide a Reducer
-// instead of JoinOP
-template <typename ReducerType,
-          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
-                                  int>::type = 0>
-__device__ inline void hip_intra_warp_shuffle_reduction(
-    const ReducerType& reducer, typename ReducerType::value_type& result,
-    const uint32_t max_active_thread = blockDim.y) {
-  using ValueType = typename ReducerType::value_type;
-  auto join_op    = [&](ValueType& result, ValueType const& tmp) {
-    reducer.join(result, tmp);
-  };
-  hip_intra_warp_shuffle_reduction(result, join_op, max_active_thread);
-
-  reducer.reference() = result;
-}
-
-template <typename ReducerType,
-          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
-                                  int>::type = 0>
-__device__ inline void hip_inter_warp_shuffle_reduction(
-    ReducerType const& reducer, typename ReducerType::value_type value,
-    int const max_active_thread = blockDim.y) {
-  using ValueType = typename ReducerType::value_type;
-  auto join_op    = [&](ValueType& a, ValueType& b) { reducer.join(a, b); };
-  hip_inter_warp_shuffle_reduction(value, join_op, max_active_thread);
-
-  reducer.reference() = value;
-}
-
-template <typename ReducerType,
-          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
-                                  int>::type = 0>
-__device__ inline void hip_intra_block_shuffle_reduction(
-    ReducerType const& reducer, typename ReducerType::value_type value,
-    int const max_active_thread = blockDim.y) {
-  hip_intra_warp_shuffle_reduction(reducer, value, max_active_thread);
-  hip_inter_warp_shuffle_reduction(reducer, value, max_active_thread);
-}
-
-template <typename ReducerType,
-          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
-                                  int>::type = 0>
-__device__ inline void hip_intra_block_shuffle_reduction(
-    ReducerType const& reducer, int const max_active_thread = blockDim.y) {
-  hip_intra_block_shuffle_reduction(reducer, reducer.reference(),
-                                    max_active_thread);
-}
-
-template <typename ReducerType,
-          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
-                                  int>::type = 0>
-__device__ inline bool hip_inter_block_shuffle_reduction(
-    ReducerType const& reducer,
-    Kokkos::Experimental::HIP::size_type* const m_scratch_space,
-    Kokkos::Experimental::HIP::size_type* const m_scratch_flags,
-    int const max_active_thread = blockDim.y) {
-  using pointer_type = typename ReducerType::value_type*;
-  using value_type   = typename ReducerType::value_type;
-
-  // Do the intra-block reduction with shfl operations for the intra warp
-  // reduction and static shared memory for the inter warp reduction
-  hip_intra_block_shuffle_reduction(reducer, max_active_thread);
-
-  value_type value = reducer.reference();
-
-  int const id = threadIdx.y * blockDim.x + threadIdx.x;
-
-  // One thread in the block writes block result to global scratch_memory
-  if (id == 0) {
-    pointer_type global =
-        reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x;
-    *global = value;
-  }
-
-  // One warp of last block performs inter block reduction through loading the
-  // block values from global scratch_memory
-  bool last_block = false;
-
-  __syncthreads();
-  int constexpr warp_size = Kokkos::Experimental::Impl::HIPTraits::WarpSize;
-  if (id < warp_size) {
-    Kokkos::Experimental::HIP::size_type count;
-
-    // Figure out whether this is the last block
-    if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1);
-    count = Kokkos::Experimental::shfl(count, 0, warp_size);
-
-    // Last block does the inter block reduction
-    if (count == gridDim.x - 1) {
-      // Set flag back to zero
-      if (id == 0) *m_scratch_flags = 0;
-      last_block = true;
-      reducer.init(value);
-
-      pointer_type const global =
-          reinterpret_cast<pointer_type>(m_scratch_space);
-
-      // Reduce all global values with splitting work over threads in one warp
-      int const step_size = blockDim.x * blockDim.y < warp_size
-                                ? blockDim.x * blockDim.y
-                                : warp_size;
-      for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) {
-        value_type tmp = global[i];
-        reducer.join(value, tmp);
-      }
-
-      // Perform shfl reductions within the warp only join if contribution is
-      // valid (allows gridDim.x non power of two and <warp_size)
-      for (unsigned int i = 1; i < warp_size; i *= 2) {
-        if ((blockDim.x * blockDim.y) > i) {
-          value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size);
-          if (id + i < gridDim.x) reducer.join(value, tmp);
-        }
-        __syncthreads();
-      }
-    }
-  }
-
-  // The last block has in its thread = 0 the global reduction value through
-  // "value"
-  return last_block;
-}
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
index 776b7c6ab..aee9756af 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <Kokkos_Core.hpp>
@@ -50,16 +54,30 @@
 
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
 
 #include <stdlib.h>
 #include <iostream>
 #include <sstream>
-#include <stdexcept>
 #include <algorithm>
 #include <atomic>
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
+namespace {
+
+static std::atomic<bool> is_first_hip_managed_allocation(true);
+
+bool hip_driver_check_page_migration(int deviceId) {
+  // check with driver if page migrating memory is available
+  // this driver query is copied from the hip documentation
+  int hasManagedMemory = 0;  // false by default
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceGetAttribute(
+      &hasManagedMemory, hipDeviceAttributeManagedMemory, deviceId));
+  return static_cast<bool>(hasManagedMemory);
+}
+}  // namespace
 namespace Kokkos {
 namespace Impl {
 
@@ -131,6 +149,8 @@ HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {}
 
 HIPHostPinnedSpace::HIPHostPinnedSpace() {}
 
+HIPManagedSpace::HIPManagedSpace() : m_device(HIP().hip_device()) {}
+
 void* HIPSpace::allocate(const size_t arg_alloc_size) const {
   return allocate("[unlabeled]", arg_alloc_size);
 }
@@ -179,7 +199,8 @@ void* HIPHostPinnedSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void* ptr = nullptr;
 
-  auto const error_code = hipHostMalloc(&ptr, arg_alloc_size);
+  auto const error_code =
+      hipHostMalloc(&ptr, arg_alloc_size, hipHostMallocNonCoherent);
   if (error_code != hipSuccess) {
     // This is the only way to clear the last error, which we should do here
     // since we're turning it into an exception here
@@ -196,6 +217,73 @@ void* HIPHostPinnedSpace::impl_allocate(
 
   return ptr;
 }
+
+void* HIPManagedSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void* HIPManagedSpace::allocate(const char* arg_label,
+                                const size_t arg_alloc_size,
+                                const size_t arg_logical_size) const {
+  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
+}
+void* HIPManagedSpace::impl_allocate(
+    const char* arg_label, const size_t arg_alloc_size,
+    const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  void* ptr = nullptr;
+
+  if (arg_alloc_size > 0) {
+    if (is_first_hip_managed_allocation.exchange(false) &&
+        Kokkos::show_warnings()) {
+      if (!hip_driver_check_page_migration(m_device)) {
+        std::cerr << R"warning(
+Kokkos::HIP::allocation WARNING: The combination of device and system configuration
+                                 does not support page migration between device and host.
+                                 HIPManagedSpace might not work as expected.
+                                 Please refer to the ROCm documentation on unified/managed memory.)warning"
+                  << std::endl;
+      }
+
+      // check for correct runtime environment
+      const char* hsa_xnack = std::getenv("HSA_XNACK");
+      if (!hsa_xnack)
+        std::cerr << R"warning(
+Kokkos::HIP::runtime WARNING: Kokkos did not find an environment variable 'HSA_XNACK'
+                              for the current process.
+                              Nevertheless, xnack is enabled for all processes if
+                              amdgpu.noretry=0 was set in the Linux kernel boot line.
+                              Without xnack enabled, Kokkos::HIPManaged might not behave
+                              as expected.)warning"
+                  << std::endl;
+      else if (Kokkos::Impl::strcmp(hsa_xnack, "1") != 0)
+        std::cerr << "Kokkos::HIP::runtime WARNING: Kokkos detected the "
+                     "environement variable "
+                  << "'HSA_XNACK=" << hsa_xnack << "\n"
+                  << "Kokkos advises to set it to '1' to enable it per process."
+                  << std::endl;
+    }
+    auto const error_code = hipMallocManaged(&ptr, arg_alloc_size);
+    if (error_code != hipSuccess) {
+      // This is the only way to clear the last error, which we should do here
+      // since we're turning it into an exception here
+      (void)hipGetLastError();
+      throw HIPRawMemoryAllocationFailure(
+          arg_alloc_size, error_code,
+          RawMemoryAllocationFailure::AllocationMechanism::HIPMallocManaged);
+    }
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemAdvise(
+        ptr, arg_alloc_size, hipMemAdviseSetCoarseGrain, m_device));
+  }
+
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
+  }
+
+  return ptr;
+}
+
 void HIPSpace::deallocate(void* const arg_alloc_ptr,
                           const size_t arg_alloc_size) const {
   deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
@@ -242,6 +330,35 @@ void HIPHostPinnedSpace::impl_deallocate(
   KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
 }
 
+void HIPManagedSpace::deallocate(void* const arg_alloc_ptr,
+                                 const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void HIPManagedSpace::deallocate(const char* arg_label,
+                                 void* const arg_alloc_ptr,
+                                 const size_t arg_alloc_size,
+                                 const size_t arg_logical_size) const {
+  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
+}
+void HIPManagedSpace::impl_deallocate(
+    const char* arg_label, void* const arg_alloc_ptr,
+    const size_t arg_alloc_size, const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
+                                      reported_size);
+  }
+  // We have to unset the CoarseGrain property manually as hipFree does not take
+  // care of it. Otherwise, the allocation would continue to linger in the
+  // kernel mem page table.
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemAdvise(
+      arg_alloc_ptr, arg_alloc_size, hipMemAdviseUnsetCoarseGrain, m_device));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
@@ -257,6 +374,9 @@ SharedAllocationRecord<void, void>
 
 SharedAllocationRecord<void, void> SharedAllocationRecord<
     Kokkos::Experimental::HIPHostPinnedSpace, void>::s_root_record;
+
+SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::HIPManagedSpace, void>::s_root_record;
 #endif
 
 SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
@@ -274,6 +394,13 @@ SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace,
                      SharedAllocationRecord<void, void>::m_alloc_size);
 }
 
+SharedAllocationRecord<Kokkos::Experimental::HIPManagedSpace,
+                       void>::~SharedAllocationRecord() {
+  m_space.deallocate(m_label.c_str(),
+                     SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     SharedAllocationRecord<void, void>::m_alloc_size);
+}
+
 SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
     SharedAllocationRecord(
         const Kokkos::Experimental::HIPSpace& arg_space,
@@ -306,6 +433,35 @@ SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
       "HostSpace");
 }
 
+SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::HIP& arg_exec_space,
+        const Kokkos::Experimental::HIPSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::HIPSpace,
+                                  void>::s_root_record,
+#endif
+          Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label,
+                                                       arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+          arg_label),
+      m_space(arg_space) {
+
+  SharedAllocationHeader header;
+
+  this->base_t::_fill_host_accessible_header_info(header, arg_label);
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace>(
+      arg_exec_space, RecordBase::m_alloc_ptr, &header,
+      sizeof(SharedAllocationHeader));
+}
+
 SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
     SharedAllocationRecord(
         const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
@@ -328,15 +484,34 @@ SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>::
                                                   arg_label);
 }
 
+SharedAllocationRecord<Kokkos::Experimental::HIPManagedSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::HIPManagedSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::HIPManagedSpace,
+                                  void>::s_root_record,
+#endif
+          Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label,
+                                                       arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+          arg_label),
+      m_space(arg_space) {
+  // Fill in the Header information, directly accessible via managed memory
+  this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr,
+                                                  arg_label);
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 namespace Kokkos {
-namespace Impl {
-int get_gpu(const InitArguments& args);
-}
 namespace Experimental {
 
 int HIP::concurrency() {
@@ -347,8 +522,8 @@ int HIP::impl_is_initialized() {
   return Impl::HIPInternal::singleton().is_initialized();
 }
 
-void HIP::impl_initialize(const HIP::SelectDevice config) {
-  Impl::HIPInternal::singleton().initialize(config.hip_device_id);
+void HIP::impl_initialize(InitializationSettings const& settings) {
+  Impl::HIPInternal::singleton().initialize(::Kokkos::Impl::get_gpu(settings));
 }
 
 void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); }
@@ -371,8 +546,21 @@ HIP::HIP(hipStream_t const stream, bool manage_stream)
                                manage_stream);
 }
 
-void HIP::print_configuration(std::ostream& s, const bool) {
-  Impl::HIPInternal::singleton().print_configuration(s);
+void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const {
+  os << "Device Execution Space:\n";
+  os << "  KOKKOS_ENABLE_HIP: yes\n";
+
+  os << "HIP Options:\n";
+  os << "  KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE: ";
+#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+
+  os << "\nRuntime Configuration:\n";
+
+  m_space_instance->print_configuration(os);
 }
 
 uint32_t HIP::impl_instance_id() const noexcept {
@@ -386,16 +574,10 @@ void HIP::impl_static_fence(const std::string& name) {
           GlobalDeviceSynchronization,
       [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
 }
-void HIP::impl_static_fence() {
-  impl_static_fence("Kokkos::HIP::impl_static_fence: Unnamed Static Fence");
-}
 
 void HIP::fence(const std::string& name) const {
   m_space_instance->fence(name);
 }
-void HIP::fence() const {
-  fence("Kokkos::HIP::fence(): Unnamed Instance Fence");
-}
 
 hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; }
 
@@ -412,56 +594,7 @@ const char* HIP::name() { return "HIP"; }
 namespace Impl {
 
 int g_hip_space_factory_initialized =
-    initialize_space_factory<HIPSpaceInitializer>("150_HIP");
-
-void HIPSpaceInitializer::initialize(const InitArguments& args) {
-  int use_gpu = Impl::get_gpu(args);
-
-  if (std::is_same<Kokkos::Experimental::HIP,
-                   Kokkos::DefaultExecutionSpace>::value ||
-      0 < use_gpu) {
-    if (use_gpu > -1) {
-      Kokkos::Experimental::HIP::impl_initialize(
-          Kokkos::Experimental::HIP::SelectDevice(use_gpu));
-    } else {
-      Kokkos::Experimental::HIP::impl_initialize();
-    }
-  }
-}
-
-void HIPSpaceInitializer::finalize(const bool all_spaces) {
-  if (std::is_same<Kokkos::Experimental::HIP,
-                   Kokkos::DefaultExecutionSpace>::value ||
-      all_spaces) {
-    if (Kokkos::Experimental::HIP::impl_is_initialized())
-      Kokkos::Experimental::HIP::impl_finalize();
-  }
-}
-
-void HIPSpaceInitializer::fence() {
-  Kokkos::Experimental::HIP::impl_static_fence();
-}
-void HIPSpaceInitializer::fence(const std::string& name) {
-  Kokkos::Experimental::HIP::impl_static_fence(name);
-}
-
-void HIPSpaceInitializer::print_configuration(std::ostream& msg,
-                                              const bool detail) {
-  msg << "Devices:" << std::endl;
-  msg << "  KOKKOS_ENABLE_HIP: ";
-  msg << "yes" << std::endl;
-
-  msg << "HIP Options:" << std::endl;
-  msg << "  KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE: ";
-#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-
-  msg << "\nRuntime Configuration:" << std::endl;
-  Experimental::HIP::print_configuration(msg, detail);
-}
+    initialize_space_factory<::Kokkos::Experimental::HIP>("150_HIP");
 
 }  // namespace Impl
 
@@ -491,6 +624,8 @@ template class HostInaccessibleSharedAllocationRecordCommon<
 template class SharedAllocationRecordCommon<Kokkos::Experimental::HIPSpace>;
 template class SharedAllocationRecordCommon<
     Kokkos::Experimental::HIPHostPinnedSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::HIPManagedSpace>;
 
 }  // end namespace Impl
 }  // end namespace Kokkos
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
index fb67a25c5..9ddfa5f65 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
@@ -69,8 +69,7 @@ struct HIPJoinFunctor {
   using value_type = Type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   volatile const value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update += input;
   }
 };
@@ -199,19 +198,21 @@ class HIPTeamMember {
    *    ( 1 == blockDim.z )
    */
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer) const noexcept {
     team_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer,
-                  typename ReducerType::value_type& value) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer,
+              typename ReducerType::value_type& value) const noexcept {
 #ifdef __HIP_DEVICE_COMPILE__
-    hip_intra_block_shuffle_reduction(reducer, value, blockDim.y);
+    typename Kokkos::Impl::FunctorAnalysis<
+        FunctorPatternInterface::REDUCE, TeamPolicy<Experimental::HIP>,
+        ReducerType>::Reducer wrapped_reducer(&reducer);
+    hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y);
+    reducer.reference() = value;
 #else
     (void)reducer;
     (void)value;
@@ -243,8 +244,11 @@ class HIPTeamMember {
 
     base_data[threadIdx.y + 1] = value;
 
-    Impl::hip_intra_block_reduce_scan<true, Impl::HIPJoinFunctor<Type>, void>(
-        Impl::HIPJoinFunctor<Type>(), base_data + 1);
+    Impl::HIPJoinFunctor<Type> hip_join_functor;
+    typename Kokkos::Impl::FunctorAnalysis<
+        FunctorPatternInterface::REDUCE, TeamPolicy<Experimental::HIP>,
+        Impl::HIPJoinFunctor<Type>>::Reducer reducer(&hip_join_functor);
+    Impl::hip_intra_block_reduce_scan<true>(reducer, base_data + 1);
 
     if (global_accum) {
       if (blockDim.y == threadIdx.y + 1) {
@@ -276,17 +280,15 @@ class HIPTeamMember {
   //----------------------------------------
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer) {
+  KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value>
+  vector_reduce(ReducerType const& reducer) {
     vector_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer,
-                    typename ReducerType::value_type& value) {
+  KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value>
+  vector_reduce(ReducerType const& reducer,
+                typename ReducerType::value_type& value) {
 #ifdef __HIP_DEVICE_COMPILE__
     if (blockDim.x == 1) return;
 
@@ -320,9 +322,10 @@ class HIPTeamMember {
   // Private for the driver
 
   KOKKOS_INLINE_FUNCTION
-  HIPTeamMember(void* shared, const int shared_begin, const int shared_size,
-                void* scratch_level_1_ptr, const int scratch_level_1_size,
-                const int arg_league_rank, const int arg_league_size)
+  HIPTeamMember(void* shared, const size_t shared_begin,
+                const size_t shared_size, void* scratch_level_1_ptr,
+                const size_t scratch_level_1_size, const int arg_league_rank,
+                const int arg_league_size)
       : m_team_reduce(shared),
         m_team_shared(((char*)shared) + shared_begin, shared_size,
                       scratch_level_1_ptr, scratch_level_1_size),
@@ -419,9 +422,9 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::HIPTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::HIPTeamMember>
 TeamThreadRange(const Impl::HIPTeamMember& thread, iType1 begin, iType2 end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember>(
       thread, iType(begin), iType(end));
 }
@@ -436,10 +439,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::HIPTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::HIPTeamMember>
 TeamVectorRange(const Impl::HIPTeamMember& thread, const iType1& begin,
                 const iType2& end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>(
       thread, iType(begin), iType(end));
 }
@@ -454,10 +457,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::HIPTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::HIPTeamMember>
 ThreadVectorRange(const Impl::HIPTeamMember& thread, iType1 arg_begin,
                   iType2 arg_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>(
       thread, iType(arg_begin), iType(arg_end));
 }
@@ -508,11 +511,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  *  performed and put into result.
  */
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::HIPTeamMember>& loop_boundaries,
-                    const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::HIPTeamMember>& loop_boundaries,
+                const Closure& closure, const ReducerType& reducer) {
 #ifdef __HIP_DEVICE_COMPILE__
   typename ReducerType::value_type value;
   reducer.init(value);
@@ -539,11 +541,10 @@ KOKKOS_INLINE_FUNCTION
  *  performed and put into result.
  */
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::HIPTeamMember>& loop_boundaries,
-                    const Closure& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::HIPTeamMember>& loop_boundaries,
+                const Closure& closure, ValueType& result) {
 #ifdef __HIP_DEVICE_COMPILE__
   ValueType val;
   Kokkos::Sum<ValueType> reducer(val);
@@ -626,11 +627,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
 }
 
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                        iType, Impl::HIPTeamMember>& loop_boundaries,
-                    const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::HIPTeamMember>& loop_boundaries,
+                const Closure& closure, const ReducerType& reducer) {
 #ifdef __HIP_DEVICE_COMPILE__
   typename ReducerType::value_type value;
   reducer.init(value);
@@ -650,11 +650,10 @@ KOKKOS_INLINE_FUNCTION
 }
 
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                        iType, Impl::HIPTeamMember>& loop_boundaries,
-                    const Closure& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::HIPTeamMember>& loop_boundaries,
+                const Closure& closure, ValueType& result) {
 #ifdef __HIP_DEVICE_COMPILE__
   ValueType val;
   Kokkos::Sum<ValueType> reducer(val);
@@ -714,11 +713,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  *  constructed value.
  */
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<is_reducer<ReducerType>::value>::type
-    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::HIPTeamMember> const& loop_boundaries,
-                    Closure const& closure, ReducerType const& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::HIPTeamMember> const& loop_boundaries,
+                Closure const& closure, ReducerType const& reducer) {
 #ifdef __HIP_DEVICE_COMPILE__
   reducer.init(reducer.reference());
 
@@ -747,11 +745,10 @@ KOKKOS_INLINE_FUNCTION
  *  constructed value.
  */
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!is_reducer<ValueType>::value>::type
-    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::HIPTeamMember> const& loop_boundaries,
-                    Closure const& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!is_reducer<ValueType>::value>
+parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::HIPTeamMember> const& loop_boundaries,
+                Closure const& closure, ValueType& result) {
 #ifdef __HIP_DEVICE_COMPILE__
   result = ValueType();
 
@@ -779,11 +776,10 @@ KOKKOS_INLINE_FUNCTION
  *  The last call to closure has final == true.
  */
 template <typename iType, class Closure, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
-                      iType, Impl::HIPTeamMember>& loop_boundaries,
-                  const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                  iType, Impl::HIPTeamMember>& loop_boundaries,
+              const Closure& closure, const ReducerType& reducer) {
 #ifdef __HIP_DEVICE_COMPILE__
   using value_type = typename ReducerType::value_type;
   value_type accum;
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp
index 99f61ed36..a0722f618 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp
@@ -103,8 +103,7 @@ class UniqueToken<HIP, UniqueTokenScope::Global> {
   size_type size() const noexcept { return m_locks.extent(0); }
 
  private:
-  // FIXME_HIP
-  KOKKOS_INLINE_FUNCTION size_type impl_acquire() const {
+  __device__ size_type impl_acquire() const {
     int idx = blockIdx.x * (blockDim.x * blockDim.y) +
               threadIdx.y * blockDim.x + threadIdx.x;
     idx                            = idx % size();
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
index a6c65ee5e..18b5f57c2 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
@@ -68,8 +68,8 @@ struct in_place_shfl_op {
   // sizeof(Scalar) < sizeof(int) case
   template <class Scalar>
   // requires _assignable_from_bits<Scalar>
-  __device__ inline typename std::enable_if<sizeof(Scalar) < sizeof(int)>::type
-  operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const
+  __device__ inline std::enable_if_t<sizeof(Scalar) < sizeof(int)> operator()(
+      Scalar& out, Scalar const& in, int lane_or_delta, int width) const
       noexcept {
     using shfl_type = int;
     union conv_type {
@@ -93,28 +93,26 @@ struct in_place_shfl_op {
   // sizeof(Scalar) == sizeof(int) case
   template <class Scalar>
   // requires _assignable_from_bits<Scalar>
-  __device__ inline typename std::enable_if<sizeof(Scalar) == sizeof(int)>::type
-  operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const
+  __device__ inline std::enable_if_t<sizeof(Scalar) == sizeof(int)> operator()(
+      Scalar& out, Scalar const& in, int lane_or_delta, int width) const
       noexcept {
     reinterpret_cast<int&>(out) = self().do_shfl_op(
         reinterpret_cast<int const&>(in), lane_or_delta, width);
   }
 
   template <class Scalar>
-  __device__ inline
-      typename std::enable_if<sizeof(Scalar) == sizeof(double)>::type
-      operator()(Scalar& out, Scalar const& in, int lane_or_delta,
-                 int width) const noexcept {
+  __device__ inline std::enable_if_t<sizeof(Scalar) == sizeof(double)>
+  operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const
+      noexcept {
     reinterpret_cast<double&>(out) = self().do_shfl_op(
         *reinterpret_cast<double const*>(&in), lane_or_delta, width);
   }
 
   // sizeof(Scalar) > sizeof(double) case
   template <typename Scalar>
-  __device__ inline
-      typename std::enable_if<(sizeof(Scalar) > sizeof(double))>::type
-      operator()(Scalar& out, const Scalar& val, int lane_or_delta,
-                 int width) const noexcept {
+  __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))>
+  operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const
+      noexcept {
     using shuffle_as_t = int;
     int constexpr N    = sizeof(Scalar) / sizeof(shuffle_as_t);
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp
index 3e053d8f1..081f6f404 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp
@@ -62,16 +62,14 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
   FunctorType m_functor;
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_one(const std::int32_t w) const noexcept {
+  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_one(
+      const std::int32_t w) const noexcept {
     m_functor(w);
   }
 
   template <class TagType>
-  __device__ inline
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_one(const std::int32_t w) const noexcept {
+  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_one(
+      const std::int32_t w) const noexcept {
     const TagType t{};
     m_functor(t, w);
   }
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
index 623c7da02..6027ead01 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
@@ -42,11 +42,17 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
 
 #ifdef KOKKOS_ENABLE_HPX
 #include <Kokkos_HPX.hpp>
 
+#include <impl/Kokkos_ExecSpaceManager.hpp>
+
 #include <hpx/local/condition_variable.hpp>
 #include <hpx/local/init.hpp>
 #include <hpx/local/thread.hpp>
@@ -87,26 +93,7 @@ int HPX::concurrency() {
   }
 }
 
-void HPX::impl_initialize(int thread_count) {
-  hpx::runtime *rt = hpx::get_runtime_ptr();
-  if (rt == nullptr) {
-    hpx::local::init_params i;
-    i.cfg = {
-        "hpx.os_threads=" + std::to_string(thread_count),
-#ifdef KOKKOS_ENABLE_DEBUG
-        "--hpx:attach-debugger=exception",
-#endif
-    };
-    int argc_hpx     = 1;
-    char name[]      = "kokkos_hpx";
-    char *argv_hpx[] = {name, nullptr};
-    hpx::local::start(nullptr, argc_hpx, argv_hpx, i);
-
-    m_hpx_initialized = true;
-  }
-}
-
-void HPX::impl_initialize() {
+void HPX::impl_initialize(InitializationSettings const &settings) {
   hpx::runtime *rt = hpx::get_runtime_ptr();
   if (rt == nullptr) {
     hpx::local::init_params i;
@@ -115,6 +102,10 @@ void HPX::impl_initialize() {
         "--hpx:attach-debugger=exception",
 #endif
     };
+    if (settings.has_num_threads()) {
+      i.cfg.emplace_back("hpx.os_threads=" +
+                         std::to_string(settings.get_num_threads()));
+    }
     int argc_hpx     = 1;
     char name[]      = "kokkos_hpx";
     char *argv_hpx[] = {name, nullptr};
@@ -148,55 +139,7 @@ void HPX::impl_finalize() {
 namespace Impl {
 
 int g_hpx_space_factory_initialized =
-    initialize_space_factory<HPXSpaceInitializer>("060_HPX");
-
-void HPXSpaceInitializer::initialize(const InitArguments &args) {
-  const int num_threads = args.num_threads;
-
-  if (std::is_same<Kokkos::Experimental::HPX,
-                   Kokkos::DefaultExecutionSpace>::value ||
-      std::is_same<Kokkos::Experimental::HPX,
-                   Kokkos::HostSpace::execution_space>::value) {
-    if (num_threads > 0) {
-      Kokkos::Experimental::HPX::impl_initialize(num_threads);
-    } else {
-      Kokkos::Experimental::HPX::impl_initialize();
-    }
-    // std::cout << "Kokkos::initialize() fyi: HPX enabled and initialized" <<
-    // std::endl ;
-  } else {
-    // std::cout << "Kokkos::initialize() fyi: HPX enabled but not initialized"
-    // << std::endl ;
-  }
-}
-
-void HPXSpaceInitializer::finalize(const bool all_spaces) {
-  if (std::is_same<Kokkos::Experimental::HPX,
-                   Kokkos::DefaultExecutionSpace>::value ||
-      std::is_same<Kokkos::Experimental::HPX,
-                   Kokkos::HostSpace::execution_space>::value ||
-      all_spaces) {
-    if (Kokkos::Experimental::HPX::impl_is_initialized())
-      Kokkos::Experimental::HPX::impl_finalize();
-  }
-}
-
-void HPXSpaceInitializer::fence(const std::string &name) {
-  Kokkos::Experimental::HPX::impl_fence_global(name);
-}
-void HPXSpaceInitializer::fence() {
-  Kokkos::Experimental::HPX::impl_fence_global();
-}
-
-void HPXSpaceInitializer::print_configuration(std::ostream &msg,
-                                              const bool detail) {
-  msg << "HPX Execution Space:" << std::endl;
-  msg << "  KOKKOS_ENABLE_HPX: ";
-  msg << "yes" << std::endl;
-
-  msg << "\nHPX Runtime Configuration:" << std::endl;
-  Kokkos::Experimental::HPX::print_configuration(msg, detail);
-}
+    initialize_space_factory<Kokkos::Experimental::HPX>("060_HPX");
 
 }  // namespace Impl
 
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.cpp
index 8d42589bd..e61ac728a 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.cpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)
 
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
index d77b1c2c7..67765a6ae 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
@@ -157,9 +157,9 @@ class TaskQueueSpecialization<
 
 template <class Scheduler>
 class TaskQueueSpecializationConstrained<
-    Scheduler, typename std::enable_if<
-                   std::is_same<typename Scheduler::execution_space,
-                                Kokkos::Experimental::HPX>::value>::type> {
+    Scheduler,
+    std::enable_if_t<std::is_same<typename Scheduler::execution_space,
+                                  Kokkos::Experimental::HPX>::value>> {
  public:
   using execution_space = Kokkos::Experimental::HPX;
   using scheduler_type  = Scheduler;
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
index a3d4a6a60..5f2eff577 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
@@ -64,14 +64,14 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
   FunctorType m_functor;
 
   template <class TagType>
-  typename std::enable_if<std::is_same<TagType, void>::value>::type
-  execute_functor(const std::int32_t w) const noexcept {
+  std::enable_if_t<std::is_void<TagType>::value> execute_functor(
+      const std::int32_t w) const noexcept {
     m_functor(w);
   }
 
   template <class TagType>
-  typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  execute_functor(const std::int32_t w) const noexcept {
+  std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
+      const std::int32_t w) const noexcept {
     const TagType t{};
     m_functor(t, w);
   }
diff --git a/packages/kokkos/core/src/KokkosExp_InterOp.hpp b/packages/kokkos/core/src/KokkosExp_InterOp.hpp
index 37c2088f8..0522ad7e8 100644
--- a/packages/kokkos/core/src/KokkosExp_InterOp.hpp
+++ b/packages/kokkos/core/src/KokkosExp_InterOp.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_CORE_EXP_INTEROP_HPP
 #define KOKKOS_CORE_EXP_INTEROP_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_INTEROP
+#endif
 
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_Layout.hpp>
@@ -144,4 +148,8 @@ auto as_python_type(Tp&& _v) {
 }  // namespace Experimental
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_INTEROP
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_INTEROP
+#endif
 #endif
diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index dfae7451f..64b31c7fe 100644
--- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
 
@@ -190,7 +199,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
   template <class... OtherProperties>
   friend struct MDRangePolicy;
 
-  static_assert(!std::is_same<typename traits::iteration_pattern, void>::value,
+  static_assert(!std::is_void<typename traits::iteration_pattern>::value,
                 "Kokkos Error: MD iteration pattern not defined");
 
   using iteration_pattern = typename traits::iteration_pattern;
diff --git a/packages/kokkos/core/src/Kokkos_AcquireUniqueTokenImpl.hpp b/packages/kokkos/core/src/Kokkos_AcquireUniqueTokenImpl.hpp
index d6227b7bc..4a22aedd8 100644
--- a/packages/kokkos/core/src/Kokkos_AcquireUniqueTokenImpl.hpp
+++ b/packages/kokkos/core/src/Kokkos_AcquireUniqueTokenImpl.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_ACQUIRE_UNIQUE_TOKEN_IMPL_HPP
 #define KOKKOS_ACQUIRE_UNIQUE_TOKEN_IMPL_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
index fb94049d7..6eed92be0 100644
--- a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_ANONYMOUSSPACE_HPP
 #define KOKKOS_ANONYMOUSSPACE_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp
index d2098d0b1..e7fec4c44 100644
--- a/packages/kokkos/core/src/Kokkos_Array.hpp
+++ b/packages/kokkos/core/src/Kokkos_Array.hpp
@@ -44,15 +44,20 @@
 
 #ifndef KOKKOS_ARRAY_HPP
 #define KOKKOS_ARRAY_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_StringManipulation.hpp>
 
 #include <type_traits>
 #include <algorithm>
+#include <utility>
 #include <limits>
 #include <cstddef>
-#include <string>
 
 namespace Kokkos {
 
@@ -64,14 +69,12 @@ struct ArrayBoundsCheck;
 template <typename Integral>
 struct ArrayBoundsCheck<Integral, true> {
   KOKKOS_INLINE_FUNCTION
-  ArrayBoundsCheck(Integral i, size_t N) {
+  constexpr ArrayBoundsCheck(Integral i, size_t N) {
     if (i < 0) {
-      KOKKOS_IF_ON_HOST((std::string s = "Kokkos::Array: index ";
-                         s += std::to_string(i); s += " < 0";
-                         Kokkos::Impl::throw_runtime_exception(s);))
-
-      KOKKOS_IF_ON_DEVICE(
-          (Kokkos::abort("Kokkos::Array: negative index in device code");))
+      char err[128] = "Kokkos::Array: index ";
+      to_chars_i(err + strlen(err), err + 128, i);
+      strcat(err, " < 0");
+      Kokkos::abort(err);
     }
     ArrayBoundsCheck<Integral, false>(i, N);
   }
@@ -80,14 +83,13 @@ struct ArrayBoundsCheck<Integral, true> {
 template <typename Integral>
 struct ArrayBoundsCheck<Integral, false> {
   KOKKOS_INLINE_FUNCTION
-  ArrayBoundsCheck(Integral i, size_t N) {
+  constexpr ArrayBoundsCheck(Integral i, size_t N) {
     if (size_t(i) >= N) {
-      KOKKOS_IF_ON_HOST((std::string s = "Kokkos::Array: index ";
-                         s += std::to_string(i); s += " >= ";
-                         s += std::to_string(N);
-                         Kokkos::Impl::throw_runtime_exception(s);))
-
-      KOKKOS_IF_ON_DEVICE((Kokkos::abort("Kokkos::Array: index >= size");))
+      char err[128] = "Kokkos::Array: index ";
+      to_chars_i(err + strlen(err), err + 128, i);
+      strcat(err, " >= ");
+      to_chars_i(err + strlen(err), err + 128, N);
+      Kokkos::abort(err);
     }
   }
 };
@@ -118,19 +120,19 @@ struct Array {
 
  public:
   using reference       = T&;
-  using const_reference = typename std::add_const<T>::type&;
+  using const_reference = std::add_const_t<T>&;
   using size_type       = size_t;
   using difference_type = ptrdiff_t;
   using value_type      = T;
   using pointer         = T*;
-  using const_pointer   = typename std::add_const<T>::type*;
+  using const_pointer   = std::add_const_t<T>*;
 
   KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return N; }
   KOKKOS_INLINE_FUNCTION static constexpr bool empty() { return false; }
   KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return N; }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION reference operator[](const iType& i) {
+  KOKKOS_INLINE_FUNCTION constexpr reference operator[](const iType& i) {
     static_assert(
         (std::is_integral<iType>::value || std::is_enum<iType>::value),
         "Must be integral argument");
@@ -139,7 +141,8 @@ struct Array {
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION const_reference operator[](const iType& i) const {
+  KOKKOS_INLINE_FUNCTION constexpr const_reference operator[](
+      const iType& i) const {
     static_assert(
         (std::is_integral<iType>::value || std::is_enum<iType>::value),
         "Must be integral argument");
@@ -147,10 +150,10 @@ struct Array {
     return m_internal_implementation_private_member_data[i];
   }
 
-  KOKKOS_INLINE_FUNCTION pointer data() {
+  KOKKOS_INLINE_FUNCTION constexpr pointer data() {
     return &m_internal_implementation_private_member_data[0];
   }
-  KOKKOS_INLINE_FUNCTION const_pointer data() const {
+  KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const {
     return &m_internal_implementation_private_member_data[0];
   }
 };
@@ -159,12 +162,12 @@ template <class T, class Proxy>
 struct Array<T, 0, Proxy> {
  public:
   using reference       = T&;
-  using const_reference = typename std::add_const<T>::type&;
+  using const_reference = std::add_const_t<T>&;
   using size_type       = size_t;
   using difference_type = ptrdiff_t;
   using value_type      = T;
   using pointer         = T*;
-  using const_pointer   = typename std::add_const<T>::type*;
+  using const_pointer   = std::add_const_t<T>*;
 
   KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return 0; }
   KOKKOS_INLINE_FUNCTION static constexpr bool empty() { return true; }
@@ -216,12 +219,12 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::contiguous> {
 
  public:
   using reference       = T&;
-  using const_reference = typename std::add_const<T>::type&;
+  using const_reference = std::add_const_t<T>&;
   using size_type       = size_t;
   using difference_type = ptrdiff_t;
   using value_type      = T;
   using pointer         = T*;
-  using const_pointer   = typename std::add_const<T>::type*;
+  using const_pointer   = std::add_const_t<T>*;
 
   KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; }
   KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size; }
@@ -285,12 +288,12 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::strided> {
 
  public:
   using reference       = T&;
-  using const_reference = typename std::add_const<T>::type&;
+  using const_reference = std::add_const_t<T>&;
   using size_type       = size_t;
   using difference_type = ptrdiff_t;
   using value_type      = T;
   using pointer         = T*;
-  using const_pointer   = typename std::add_const<T>::type*;
+  using const_pointer   = std::add_const_t<T>*;
 
   KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; }
   KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size; }
@@ -347,4 +350,54 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::strided> {
 
 }  // namespace Kokkos
 
+//<editor-fold desc="Support for structured binding">
+// guarding against bogus error 'specialization in different namespace' with
+// older GCC that do not support C++17 anyway
+#if !defined(KOKKOS_COMPILER_GNU) || (KOKKOS_COMPILER_GNU >= 710)
+#if defined(KOKKOS_COMPILER_CLANG) && KOKKOS_COMPILER_CLANG < 800
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+template <class T, std::size_t N>
+struct std::tuple_size<Kokkos::Array<T, N>>
+    : std::integral_constant<std::size_t, N> {};
+
+template <std::size_t I, class T, std::size_t N>
+struct std::tuple_element<I, Kokkos::Array<T, N>> {
+  using type = T;
+};
+#if defined(KOKKOS_COMPILER_CLANG) && KOKKOS_COMPILER_CLANG < 800
+#pragma clang diagnostic pop
+#endif
+#endif
+
+namespace Kokkos {
+
+template <std::size_t I, class T, std::size_t N>
+KOKKOS_FUNCTION constexpr T& get(Array<T, N>& a) noexcept {
+  return a[I];
+}
+
+template <std::size_t I, class T, std::size_t N>
+KOKKOS_FUNCTION constexpr T const& get(Array<T, N> const& a) noexcept {
+  return a[I];
+}
+
+template <std::size_t I, class T, std::size_t N>
+KOKKOS_FUNCTION constexpr T&& get(Array<T, N>&& a) noexcept {
+  return std::move(a[I]);
+}
+
+template <std::size_t I, class T, std::size_t N>
+KOKKOS_FUNCTION constexpr T const&& get(Array<T, N> const&& a) noexcept {
+  return std::move(a[I]);
+}
+
+}  // namespace Kokkos
+//</editor-fold>
+
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY
+#endif
 #endif /* #ifndef KOKKOS_ARRAY_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp
index b07b5f2f6..7a2d1c662 100644
--- a/packages/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp
@@ -67,6 +67,10 @@
 
 #ifndef KOKKOS_ATOMIC_HPP
 #define KOKKOS_ATOMIC_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC
+#endif
 
 #include <Kokkos_Macros.hpp>
 
@@ -414,4 +418,8 @@ KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
 }  // namespace Kokkos
 
 #endif /* !KOKKOS_ENABLE_IMPL_DESUL_ATOMICS */
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC
+#endif
 #endif /* KOKKOS_ATOMIC_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp
index 81ae34b9e..ef576d74c 100644
--- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp
@@ -41,6 +41,15 @@
 // ************************************************************************
 //@HEADER
 */
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_ATOMICS_DESUL_CONFIG_HPP
 #define KOKKOS_ATOMICS_DESUL_CONFIG_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
index d2bc9df89..b202ab8f8 100644
--- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
@@ -1,3 +1,13 @@
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
 #define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
 #include <Kokkos_Macros.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
index 939cf950b..ed7e8d9ed 100644
--- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
@@ -1,3 +1,13 @@
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
 #define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
 #include <Kokkos_Macros.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp
index 466903ab7..009c73c90 100644
--- a/packages/kokkos/core/src/Kokkos_Complex.hpp
+++ b/packages/kokkos/core/src/Kokkos_Complex.hpp
@@ -43,6 +43,10 @@
 */
 #ifndef KOKKOS_COMPLEX_HPP
 #define KOKKOS_COMPLEX_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_COMPLEX
+#endif
 
 #include <Kokkos_Atomic.hpp>
 #include <Kokkos_MathematicalFunctions.hpp>
@@ -87,9 +91,9 @@ class
   complex& operator=(const complex&) noexcept = default;
 
   /// \brief Conversion constructor from compatible RType
-  template <class RType,
-            typename std::enable_if<std::is_convertible<RType, RealType>::value,
-                                    int>::type = 0>
+  template <
+      class RType,
+      std::enable_if_t<std::is_convertible<RType, RealType>::value, int> = 0>
   KOKKOS_INLINE_FUNCTION complex(const complex<RType>& other) noexcept
       // Intentionally do the conversions implicitly here so that users don't
       // get any warnings about narrowing, etc., that they would expect to get
@@ -217,7 +221,6 @@ class
   // Conditional noexcept, just in case RType throws on divide-by-zero
   constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const complex<RealType>& y) noexcept(noexcept(RealType{} / RealType{})) {
-    using Kokkos::Experimental::fabs;
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
@@ -245,7 +248,6 @@ class
   constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const std::complex<RealType>& y) noexcept(noexcept(RealType{} /
                                                          RealType{})) {
-    using Kokkos::Experimental::fabs;
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
@@ -282,9 +284,9 @@ class
   //---------------------------------------------------------------------------
 
   //! Copy constructor from volatile.
-  template <class RType,
-            typename std::enable_if<std::is_convertible<RType, RealType>::value,
-                                    int>::type = 0>
+  template <
+      class RType,
+      std::enable_if_t<std::is_convertible<RType, RealType>::value, int> = 0>
   KOKKOS_INLINE_FUNCTION complex(const volatile complex<RType>& src) noexcept
       // Intentionally do the conversions implicitly here so that users don't
       // get any warnings about narrowing, etc., that they would expect to get
@@ -312,8 +314,7 @@ class
   //    vl = r;
   //    vl = cr;
   template <class Complex,
-            typename std::enable_if<std::is_same<Complex, complex>::value,
-                                    int>::type = 0>
+            std::enable_if_t<std::is_same<Complex, complex>::value, int> = 0>
   KOKKOS_INLINE_FUNCTION void operator=(const Complex& src) volatile noexcept {
     re_ = src.re_;
     im_ = src.im_;
@@ -335,8 +336,7 @@ class
   //    vl = vr;
   //    vl = cvr;
   template <class Complex,
-            typename std::enable_if<std::is_same<Complex, complex>::value,
-                                    int>::type = 0>
+            std::enable_if_t<std::is_same<Complex, complex>::value, int> = 0>
   KOKKOS_INLINE_FUNCTION volatile complex& operator=(
       const volatile Complex& src) volatile noexcept {
     re_ = src.re_;
@@ -358,8 +358,7 @@ class
   //    l = cvr;
   //
   template <class Complex,
-            typename std::enable_if<std::is_same<Complex, complex>::value,
-                                    int>::type = 0>
+            std::enable_if_t<std::is_same<Complex, complex>::value, int> = 0>
   KOKKOS_INLINE_FUNCTION complex& operator=(
       const volatile Complex& src) noexcept {
     re_ = src.re_;
@@ -451,7 +450,7 @@ class
 template <class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x,
                                        complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) == common_type(y.real()) &&
          common_type(x.imag()) == common_type(y.imag());
 }
@@ -462,7 +461,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x,
 template <class RealType1, class RealType2>
 inline bool operator==(std::complex<RealType1> const& x,
                        complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) == common_type(y.real()) &&
          common_type(x.imag()) == common_type(y.imag());
 }
@@ -471,7 +470,7 @@ inline bool operator==(std::complex<RealType1> const& x,
 template <class RealType1, class RealType2>
 inline bool operator==(complex<RealType1> const& x,
                        std::complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) == common_type(y.real()) &&
          common_type(x.imag()) == common_type(y.imag());
 }
@@ -480,11 +479,10 @@ inline bool operator==(complex<RealType1> const& x,
 template <
     class RealType1, class RealType2,
     // Constraints to avoid participation in oparator==() for every possible RHS
-    typename std::enable_if<std::is_convertible<RealType2, RealType1>::value,
-                            int>::type = 0>
+    std::enable_if_t<std::is_convertible<RealType2, RealType1>::value, int> = 0>
 KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x,
                                        RealType2 const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) == common_type(y) &&
          common_type(x.imag()) == common_type(0);
 }
@@ -493,11 +491,10 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x,
 template <
     class RealType1, class RealType2,
     // Constraints to avoid participation in oparator==() for every possible RHS
-    typename std::enable_if<std::is_convertible<RealType1, RealType2>::value,
-                            int>::type = 0>
+    std::enable_if_t<std::is_convertible<RealType1, RealType2>::value, int> = 0>
 KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x,
                                        complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x) == common_type(y.real()) &&
          common_type(0) == common_type(y.imag());
 }
@@ -506,7 +503,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x,
 template <class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x,
                                        complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) != common_type(y.real()) ||
          common_type(x.imag()) != common_type(y.imag());
 }
@@ -515,7 +512,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x,
 template <class RealType1, class RealType2>
 inline bool operator!=(std::complex<RealType1> const& x,
                        complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) != common_type(y.real()) ||
          common_type(x.imag()) != common_type(y.imag());
 }
@@ -524,7 +521,7 @@ inline bool operator!=(std::complex<RealType1> const& x,
 template <class RealType1, class RealType2>
 inline bool operator!=(complex<RealType1> const& x,
                        std::complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) != common_type(y.real()) ||
          common_type(x.imag()) != common_type(y.imag());
 }
@@ -533,11 +530,10 @@ inline bool operator!=(complex<RealType1> const& x,
 template <
     class RealType1, class RealType2,
     // Constraints to avoid participation in oparator==() for every possible RHS
-    typename std::enable_if<std::is_convertible<RealType2, RealType1>::value,
-                            int>::type = 0>
+    std::enable_if_t<std::is_convertible<RealType2, RealType1>::value, int> = 0>
 KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x,
                                        RealType2 const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x.real()) != common_type(y) ||
          common_type(x.imag()) != common_type(0);
 }
@@ -546,11 +542,10 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x,
 template <
     class RealType1, class RealType2,
     // Constraints to avoid participation in oparator==() for every possible RHS
-    typename std::enable_if<std::is_convertible<RealType1, RealType2>::value,
-                            int>::type = 0>
+    std::enable_if_t<std::is_convertible<RealType1, RealType2>::value, int> = 0>
 KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x,
                                        complex<RealType2> const& y) noexcept {
-  using common_type = typename std::common_type<RealType1, RealType2>::type;
+  using common_type = std::common_type_t<RealType1, RealType2>;
   return common_type(x) != common_type(y.real()) ||
          common_type(0) != common_type(y.imag());
 }
@@ -560,30 +555,26 @@ KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x,
 
 //! Binary + operator for complex complex.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator+(const complex<RealType1>& x,
-              const complex<RealType2>& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x.real() + y.real(), x.imag() + y.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator+(const complex<RealType1>& x, const complex<RealType2>& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x.real() + y.real(),
+                                                           x.imag() + y.imag());
 }
 
 //! Binary + operator for complex scalar.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator+(const complex<RealType1>& x, const RealType2& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x.real() + y, x.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator+(const complex<RealType1>& x, const RealType2& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x.real() + y,
+                                                           x.imag());
 }
 
 //! Binary + operator for scalar complex.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator+(const RealType1& x, const complex<RealType2>& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x + y.real(), y.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator+(const RealType1& x, const complex<RealType2>& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x + y.real(),
+                                                           y.imag());
 }
 
 //! Unary + operator for complex.
@@ -595,30 +586,26 @@ KOKKOS_INLINE_FUNCTION complex<RealType> operator+(
 
 //! Binary - operator for complex.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator-(const complex<RealType1>& x,
-              const complex<RealType2>& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x.real() - y.real(), x.imag() - y.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator-(const complex<RealType1>& x, const complex<RealType2>& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x.real() - y.real(),
+                                                           x.imag() - y.imag());
 }
 
 //! Binary - operator for complex scalar.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator-(const complex<RealType1>& x, const RealType2& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x.real() - y, x.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator-(const complex<RealType1>& x, const RealType2& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x.real() - y,
+                                                           x.imag());
 }
 
 //! Binary - operator for scalar complex.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator-(const RealType1& x, const complex<RealType2>& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x - y.real(), -y.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator-(const RealType1& x, const complex<RealType2>& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x - y.real(),
+                                                           -y.imag());
 }
 
 //! Unary - operator for complex.
@@ -630,11 +617,9 @@ KOKKOS_INLINE_FUNCTION complex<RealType> operator-(
 
 //! Binary * operator for complex.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator*(const complex<RealType1>& x,
-              const complex<RealType2>& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator*(const complex<RealType1>& x, const complex<RealType2>& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(
       x.real() * y.real() - x.imag() * y.imag(),
       x.real() * y.imag() + x.imag() * y.real());
 }
@@ -648,9 +633,9 @@ KOKKOS_INLINE_FUNCTION
 /// std::complex's methods and nonmember functions are not marked as
 /// CUDA device functions.
 template <class RealType1, class RealType2>
-inline complex<typename std::common_type<RealType1, RealType2>::type> operator*(
+inline complex<std::common_type_t<RealType1, RealType2>> operator*(
     const std::complex<RealType1>& x, const complex<RealType2>& y) {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
+  return complex<std::common_type_t<RealType1, RealType2>>(
       x.real() * y.real() - x.imag() * y.imag(),
       x.real() * y.imag() + x.imag() * y.real());
 }
@@ -660,11 +645,10 @@ inline complex<typename std::common_type<RealType1, RealType2>::type> operator*(
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator*(const RealType1& x, const complex<RealType2>& y) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x * y.real(), x * y.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator*(const RealType1& x, const complex<RealType2>& y) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x * y.real(),
+                                                           x * y.imag());
 }
 
 /// \brief Binary * operator for RealType times complex.
@@ -672,11 +656,10 @@ KOKKOS_INLINE_FUNCTION
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator*(const complex<RealType1>& y, const RealType2& x) noexcept {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      x * y.real(), x * y.imag());
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator*(const complex<RealType1>& y, const RealType2& x) noexcept {
+  return complex<std::common_type_t<RealType1, RealType2>>(x * y.real(),
+                                                           x * y.imag());
 }
 
 //! Imaginary part of a complex number.
@@ -706,8 +689,6 @@ KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t<ArithmeticType> real(
 //! Constructs a complex number from magnitude and phase angle
 template <class T>
 KOKKOS_INLINE_FUNCTION complex<T> polar(const T& r, const T& theta = T()) {
-  using Kokkos::Experimental::cos;
-  using Kokkos::Experimental::sin;
   KOKKOS_EXPECTS(r >= 0);
   return complex<T>(r * cos(theta), r * sin(theta));
 }
@@ -715,15 +696,12 @@ KOKKOS_INLINE_FUNCTION complex<T> polar(const T& r, const T& theta = T()) {
 //! Absolute value (magnitude) of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION RealType abs(const complex<RealType>& x) {
-  using Kokkos::Experimental::hypot;
   return hypot(x.real(), x.imag());
 }
 
 //! Power of a complex number
 template <class T>
 KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x, const T& y) {
-  using Kokkos::Experimental::atan2;
-  using Kokkos::Experimental::pow;
   T r     = abs(x);
   T theta = atan2(x.imag(), x.real());
   return polar(pow(r, y), y * theta);
@@ -737,8 +715,6 @@ KOKKOS_INLINE_FUNCTION complex<T> pow(const T& x, const complex<T>& y) {
 template <class T>
 KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x,
                                       const complex<T>& y) {
-  using Kokkos::Experimental::log;
-
   return x == T() ? T() : exp(y * log(x));
 }
 
@@ -770,9 +746,6 @@ KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sqrt(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::fabs;
-  using Kokkos::Experimental::sqrt;
-
   RealType r = x.real();
   RealType i = x.imag();
 
@@ -805,9 +778,6 @@ KOKKOS_INLINE_FUNCTION constexpr complex<Impl::promote_t<ArithmeticType>> conj(
 //! Exponential of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) {
-  using Kokkos::Experimental::cos;
-  using Kokkos::Experimental::exp;
-  using Kokkos::Experimental::sin;
   return exp(x.real()) * complex<RealType>(cos(x.imag()), sin(x.imag()));
 }
 
@@ -815,20 +785,21 @@ KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) {
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> log(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::atan2;
-  using Kokkos::Experimental::log;
   RealType phi = atan2(x.imag(), x.real());
   return Kokkos::complex<RealType>(log(abs(x)), phi);
 }
 
+//! base 10 log of a complex number.
+template <class RealType>
+KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> log10(
+    const complex<RealType>& x) {
+  return log(x) / log(RealType(10));
+}
+
 //! sine of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sin(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::cos;
-  using Kokkos::Experimental::cosh;
-  using Kokkos::Experimental::sin;
-  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(sin(x.real()) * cosh(x.imag()),
                                    cos(x.real()) * sinh(x.imag()));
 }
@@ -837,10 +808,6 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sin(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> cos(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::cos;
-  using Kokkos::Experimental::cosh;
-  using Kokkos::Experimental::sin;
-  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(cos(x.real()) * cosh(x.imag()),
                                    -sin(x.real()) * sinh(x.imag()));
 }
@@ -856,10 +823,6 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> tan(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sinh(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::cos;
-  using Kokkos::Experimental::cosh;
-  using Kokkos::Experimental::sin;
-  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(sinh(x.real()) * cos(x.imag()),
                                    cosh(x.real()) * sin(x.imag()));
 }
@@ -868,10 +831,6 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sinh(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> cosh(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::cos;
-  using Kokkos::Experimental::cosh;
-  using Kokkos::Experimental::sin;
-  using Kokkos::Experimental::sinh;
   return Kokkos::complex<RealType>(cosh(x.real()) * cos(x.imag()),
                                    sinh(x.real()) * sin(x.imag()));
 }
@@ -902,9 +861,6 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acosh(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> atanh(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::atan2;
-  using Kokkos::Experimental::log;
-
   const RealType i2 = x.imag() * x.imag();
   const RealType r  = RealType(1.0) - i2 - x.real() * x.real();
 
@@ -932,7 +888,6 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> asin(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acos(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::acos;
   Kokkos::complex<RealType> t = asin(x);
   RealType pi_2               = acos(RealType(0.0));
   return Kokkos::complex<RealType>(pi_2 - t.real(), -t.imag());
@@ -942,8 +897,6 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acos(
 template <class RealType>
 KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> atan(
     const complex<RealType>& x) {
-  using Kokkos::Experimental::atan2;
-  using Kokkos::Experimental::log;
   const RealType r2 = x.real() * x.real();
   const RealType i  = RealType(1.0) - r2 - x.imag() * x.imag();
 
@@ -969,28 +922,23 @@ inline complex<RealType> exp(const std::complex<RealType>& c) {
 
 //! Binary operator / for complex and real numbers
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator/(const complex<RealType1>& x,
-              const RealType2& y) noexcept(noexcept(RealType1{} /
-                                                    RealType2{})) {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(
-      real(x) / y, imag(x) / y);
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator/(const complex<RealType1>& x,
+          const RealType2& y) noexcept(noexcept(RealType1{} / RealType2{})) {
+  return complex<std::common_type_t<RealType1, RealType2>>(real(x) / y,
+                                                           imag(x) / y);
 }
 
 //! Binary operator / for complex.
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator/(const complex<RealType1>& x,
-              const complex<RealType2>& y) noexcept(noexcept(RealType1{} /
-                                                             RealType2{})) {
-  using Kokkos::Experimental::fabs;
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator/(const complex<RealType1>& x,
+          const complex<RealType2>& y) noexcept(noexcept(RealType1{} /
+                                                         RealType2{})) {
   // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
   // If the real part is +/-Inf and the imaginary part is -/+Inf,
   // this won't change the result.
-  using common_real_type =
-      typename std::common_type<RealType1, RealType2>::type;
+  using common_real_type   = std::common_type_t<RealType1, RealType2>;
   const common_real_type s = fabs(real(y)) + fabs(imag(y));
 
   // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
@@ -1012,12 +960,11 @@ KOKKOS_INLINE_FUNCTION
 
 //! Binary operator / for complex and real numbers
 template <class RealType1, class RealType2>
-KOKKOS_INLINE_FUNCTION
-    complex<typename std::common_type<RealType1, RealType2>::type>
-    operator/(const RealType1& x,
-              const complex<RealType2>& y) noexcept(noexcept(RealType1{} /
-                                                             RealType2{})) {
-  return complex<typename std::common_type<RealType1, RealType2>::type>(x) / y;
+KOKKOS_INLINE_FUNCTION complex<std::common_type_t<RealType1, RealType2>>
+operator/(const RealType1& x,
+          const complex<RealType2>& y) noexcept(noexcept(RealType1{} /
+                                                         RealType2{})) {
+  return complex<std::common_type_t<RealType1, RealType2>>(x) / y;
 }
 
 template <class RealType>
@@ -1050,4 +997,8 @@ struct reduction_identity<Kokkos::complex<T>> {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_COMPLEX
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_COMPLEX
+#endif
 #endif  // KOKKOS_COMPLEX_HPP
diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp
index 5a1a571e4..63f2b896b 100644
--- a/packages/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_CORE_CONCEPTS_HPP
 #define KOKKOS_CORE_CONCEPTS_HPP
 
@@ -174,7 +183,8 @@ KOKKOS_IMPL_IS_CONCEPT(array_layout)
 KOKKOS_IMPL_IS_CONCEPT(reducer)
 namespace Experimental {
 KOKKOS_IMPL_IS_CONCEPT(work_item_property)
-}
+KOKKOS_IMPL_IS_CONCEPT(hooks_policy)
+}  // namespace Experimental
 
 namespace Impl {
 
@@ -269,8 +279,7 @@ struct is_device_helper<Device<ExecutionSpace, MemorySpace>> : std::true_type {
 }  // namespace Impl
 
 template <typename T>
-using is_device =
-    typename Impl::is_device_helper<typename std::remove_cv<T>::type>::type;
+using is_device = typename Impl::is_device_helper<std::remove_cv_t<T>>::type;
 
 //----------------------------------------------------------------------------
 
@@ -293,32 +302,26 @@ struct is_space {
   };
 
   template <typename U>
-  struct exe<U, typename std::conditional<true, void,
-                                          typename U::execution_space>::type>
+  struct exe<U, std::conditional_t<true, void, typename U::execution_space>>
       : std::is_same<U, typename U::execution_space>::type {
     using space = typename U::execution_space;
   };
 
   template <typename U>
-  struct mem<
-      U, typename std::conditional<true, void, typename U::memory_space>::type>
+  struct mem<U, std::conditional_t<true, void, typename U::memory_space>>
       : std::is_same<U, typename U::memory_space>::type {
     using space = typename U::memory_space;
   };
 
   template <typename U>
-  struct dev<
-      U, typename std::conditional<true, void, typename U::device_type>::type>
+  struct dev<U, std::conditional_t<true, void, typename U::device_type>>
       : std::is_same<U, typename U::device_type>::type {
     using space = typename U::device_type;
   };
 
-  using is_exe =
-      typename is_space<T>::template exe<typename std::remove_cv<T>::type>;
-  using is_mem =
-      typename is_space<T>::template mem<typename std::remove_cv<T>::type>;
-  using is_dev =
-      typename is_space<T>::template dev<typename std::remove_cv<T>::type>;
+  using is_exe = typename is_space<T>::template exe<std::remove_cv_t<T>>;
+  using is_mem = typename is_space<T>::template mem<std::remove_cv_t<T>>;
+  using is_dev = typename is_space<T>::template dev<std::remove_cv_t<T>>;
 
  public:
   static constexpr bool value = is_exe::value || is_mem::value || is_dev::value;
@@ -342,7 +345,9 @@ struct is_space {
           std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value
 #elif defined(KOKKOS_ENABLE_HIP)
           || std::is_same<memory_space,
-                          Kokkos::Experimental::HIPHostPinnedSpace>::value
+                          Kokkos::Experimental::HIPHostPinnedSpace>::value ||
+          std::is_same<memory_space,
+                       Kokkos::Experimental::HIPManagedSpace>::value
 #elif defined(KOKKOS_ENABLE_SYCL)
           || std::is_same<memory_space,
                           Kokkos::Experimental::SYCLSharedUSMSpace>::value ||
@@ -500,11 +505,11 @@ struct SpaceAccessibility {
   // to be able to access MemorySpace?
   // If same memory space or not accessible use the AccessSpace
   // else construct a device with execution space and memory space.
-  using space = typename std::conditional<
+  using space = std::conditional_t<
       std::is_same<typename AccessSpace::memory_space, MemorySpace>::value ||
           !exe_access::accessible,
       AccessSpace,
-      Kokkos::Device<typename AccessSpace::execution_space, MemorySpace>>::type;
+      Kokkos::Device<typename AccessSpace::execution_space, MemorySpace>>;
 };
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
index ac516e31e..0a66ee9da 100644
--- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp
+++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_COPYVIEWS_HPP_
 #define KOKKOS_COPYVIEWS_HPP_
 #include <string>
@@ -862,14 +871,20 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 1> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
-      view_copy(dst, src);
+      view_copy(exec_space..., dst, src);
     } else {
       p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
       using sv_adapter_type = CommonSubview<DstType, SrcType, 1, p_type>;
       sv_adapter_type common_subview(dst, src, ext0);
-      view_copy(common_subview.dst_sub, common_subview.src_sub);
+      view_copy(exec_space..., common_subview.dst_sub, common_subview.src_sub);
     }
   }
 };
@@ -878,16 +893,23 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 2> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
       if (dst.extent(1) == src.extent(1)) {
-        view_copy(dst, src);
+        view_copy(exec_space..., dst, src);
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 2, Kokkos::Impl::ALL_t, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     } else {
       if (dst.extent(1) == src.extent(1)) {
@@ -895,14 +917,16 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 2> {
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 2, p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 2, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, ext0, ext1);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     }
   }
@@ -912,7 +936,13 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
       if (dst.extent(2) == src.extent(2)) {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -921,7 +951,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
                           Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1,
                                        Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
@@ -929,7 +960,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
             CommonSubview<DstType, SrcType, 3, Kokkos::Impl::ALL_t, p_type,
                           p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     } else {
       if (dst.extent(2) == src.extent(2)) {
@@ -938,7 +970,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
         using sv_adapter_type = CommonSubview<DstType, SrcType, 3, p_type,
                                               p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -946,7 +979,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 3> {
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 3, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     }
   }
@@ -956,7 +990,13 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
       if (dst.extent(3) == src.extent(3)) {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -966,7 +1006,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
                           p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2,
                                        Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
@@ -975,7 +1016,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
             CommonSubview<DstType, SrcType, 4, Kokkos::Impl::ALL_t, p_type,
                           p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     } else {
       if (dst.extent(7) == src.extent(7)) {
@@ -986,7 +1028,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
             CommonSubview<DstType, SrcType, 4, p_type, p_type, p_type,
                           Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -995,7 +1038,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 4> {
         using sv_adapter_type =
             CommonSubview<DstType, SrcType, 4, p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     }
   }
@@ -1005,7 +1049,13 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
       if (dst.extent(4) == src.extent(4)) {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1016,7 +1066,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
                           p_type, p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
@@ -1027,7 +1078,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
                           p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     } else {
       if (dst.extent(4) == src.extent(4)) {
@@ -1040,7 +1092,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
                           Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3,
                                        Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1050,7 +1103,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 5> {
         using sv_adapter_type = CommonSubview<DstType, SrcType, 5, p_type,
                                               p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     }
   }
@@ -1059,7 +1113,13 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
       if (dst.extent(5) == src.extent(5)) {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1071,7 +1131,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
                           p_type, p_type, p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
@@ -1083,7 +1144,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
                           p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     } else {
       if (dst.extent(5) == src.extent(5)) {
@@ -1098,7 +1160,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
                           p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1112,7 +1175,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 6> {
                           p_type, p_type>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        ext5);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     }
   }
@@ -1122,7 +1186,13 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
       if (dst.extent(6) == src.extent(6)) {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1135,7 +1205,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
                           p_type, p_type, p_type, p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
@@ -1148,7 +1219,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
                           p_type, p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, ext6);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     } else {
       if (dst.extent(6) == src.extent(6)) {
@@ -1163,7 +1235,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
                           p_type, p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        ext5, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1177,7 +1250,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 7> {
                           p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        ext5, ext6);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     }
   }
@@ -1187,7 +1261,13 @@ template <class DstType, class SrcType, class ExecSpace>
 struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
   using p_type = Kokkos::pair<int64_t, int64_t>;
 
-  ViewRemap(const DstType& dst, const SrcType& src) {
+  template <typename... OptExecSpace>
+  ViewRemap(const DstType& dst, const SrcType& src,
+            const OptExecSpace&... exec_space) {
+    static_assert(
+        sizeof...(OptExecSpace) <= 1,
+        "OptExecSpace must be either empty or be an execution space!");
+
     if (dst.extent(0) == src.extent(0)) {
       if (dst.extent(7) == src.extent(7)) {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1202,7 +1282,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
                           Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, ext6, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
         p_type ext2(0, std::min(dst.extent(2), src.extent(2)));
@@ -1216,7 +1297,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
                           p_type, p_type, p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3,
                                        ext4, ext5, ext6, ext7);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     } else {
       if (dst.extent(7) == src.extent(7)) {
@@ -1232,7 +1314,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
                           p_type, p_type, p_type, Kokkos::Impl::ALL_t>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        ext5, ext6, Kokkos::ALL);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       } else {
         p_type ext0(0, std::min(dst.extent(0), src.extent(0)));
         p_type ext1(0, std::min(dst.extent(1), src.extent(1)));
@@ -1247,7 +1330,8 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
                           p_type, p_type, p_type, p_type>;
         sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4,
                                        ext5, ext6, ext7);
-        view_copy(common_subview.dst_sub, common_subview.src_sub);
+        view_copy(exec_space..., common_subview.dst_sub,
+                  common_subview.src_sub);
       }
     }
   }
@@ -1261,9 +1345,9 @@ inline void contiguous_fill(
   using ViewTypeFlat = Kokkos::View<
       typename ViewType::value_type*, Kokkos::LayoutRight,
       Kokkos::Device<typename ViewType::execution_space,
-                     typename std::conditional<ViewType::Rank == 0,
-                                               typename ViewType::memory_space,
-                                               Kokkos::AnonymousSpace>::type>,
+                     std::conditional_t<ViewType::Rank == 0,
+                                        typename ViewType::memory_space,
+                                        Kokkos::AnonymousSpace>>,
       Kokkos::MemoryTraits<0>>;
 
   ViewTypeFlat dst_flat(dst.data(), dst.size());
@@ -1292,23 +1376,27 @@ struct ZeroMemset {
 
 template <typename ExecutionSpace, class DT, class... DP>
 inline std::enable_if_t<
-    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value &&
     std::is_trivially_copy_assignable<
-        typename ViewTraits<DT, DP...>::const_value_type>::value>
+        typename ViewTraits<DT, DP...>::value_type>::value>
 contiguous_fill_or_memset(
     const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value) {
+// On A64FX memset seems to do the wrong thing with regards to first touch
+// leading to the significant performance issues
+#ifndef KOKKOS_ARCH_A64FX
   if (Impl::is_zero_byte(value))
     ZeroMemset<ExecutionSpace, DT, DP...>(exec_space, dst, value);
   else
+#endif
     contiguous_fill(exec_space, dst, value);
 }
 
 template <typename ExecutionSpace, class DT, class... DP>
-inline std::enable_if_t<!(
-    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
-    std::is_trivially_copy_assignable<
-        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+inline std::enable_if_t<
+    !(std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value &&
+      std::is_trivially_copy_assignable<
+          typename ViewTraits<DT, DP...>::value_type>::value)>
 contiguous_fill_or_memset(
     const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value) {
@@ -1317,26 +1405,30 @@ contiguous_fill_or_memset(
 
 template <class DT, class... DP>
 inline std::enable_if_t<
-    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value &&
     std::is_trivially_copy_assignable<
-        typename ViewTraits<DT, DP...>::const_value_type>::value>
+        typename ViewTraits<DT, DP...>::value_type>::value>
 contiguous_fill_or_memset(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value) {
   using ViewType        = View<DT, DP...>;
   using exec_space_type = typename ViewType::execution_space;
 
+// On A64FX memset seems to do the wrong thing with regards to first touch
+// leading to the significant performance issues
+#ifndef KOKKOS_ARCH_A64FX
   if (Impl::is_zero_byte(value))
     ZeroMemset<exec_space_type, DT, DP...>(dst, value);
   else
+#endif
     contiguous_fill(exec_space_type(), dst, value);
 }
 
 template <class DT, class... DP>
-inline std::enable_if_t<!(
-    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
-    std::is_trivially_copy_assignable<
-        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+inline std::enable_if_t<
+    !(std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value &&
+      std::is_trivially_copy_assignable<
+          typename ViewTraits<DT, DP...>::value_type>::value)>
 contiguous_fill_or_memset(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value) {
@@ -1352,9 +1444,8 @@ template <class DT, class... DP>
 inline void deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   using ViewType        = View<DT, DP...>;
   using exec_space_type = typename ViewType::execution_space;
 
@@ -1416,9 +1507,10 @@ inline void deep_copy(
 
   // Lets call the right ViewFill functor based on integer space needed and
   // iteration type
-  using ViewTypeUniform = typename std::conditional<
-      ViewType::Rank == 0, typename ViewType::uniform_runtime_type,
-      typename ViewType::uniform_runtime_nomemspace_type>::type;
+  using ViewTypeUniform =
+      std::conditional_t<ViewType::Rank == 0,
+                         typename ViewType::uniform_runtime_type,
+                         typename ViewType::uniform_runtime_nomemspace_type>;
   if (dst.span() > static_cast<size_t>(std::numeric_limits<int>::max())) {
     if (iterate == Kokkos::Iterate::Right)
       Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight,
@@ -1450,9 +1542,8 @@ template <class ST, class... SP>
 inline void deep_copy(
     typename ViewTraits<ST, SP...>::non_const_value_type& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<ST, SP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<ST, SP...>::specialize,
+                                  void>::value>* = nullptr) {
   using src_traits       = ViewTraits<ST, SP...>;
   using src_memory_space = typename src_traits::memory_space;
 
@@ -1487,12 +1578,11 @@ inline void deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(
-        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
-        (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
-         unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>::type* =
-        nullptr) {
+    std::enable_if_t<
+        (std::is_void<typename ViewTraits<DT, DP...>::specialize>::value &&
+         std::is_void<typename ViewTraits<ST, SP...>::specialize>::value &&
+         (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
+          unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>* = nullptr) {
   using dst_type = View<DT, DP...>;
   using src_type = View<ST, SP...>;
 
@@ -1540,11 +1630,11 @@ inline void deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(
-        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
-        (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
-         unsigned(ViewTraits<ST, SP...>::rank) != 0))>::type* = nullptr) {
+    std::enable_if_t<
+        (std::is_void<typename ViewTraits<DT, DP...>::specialize>::value &&
+         std::is_void<typename ViewTraits<ST, SP...>::specialize>::value &&
+         (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
+          unsigned(ViewTraits<ST, SP...>::rank) != 0))>* = nullptr) {
   using dst_type            = View<DT, DP...>;
   using src_type            = View<ST, SP...>;
   using dst_execution_space = typename dst_type::execution_space;
@@ -1748,9 +1838,8 @@ template <class TeamType, class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 1 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 1)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 1 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 1)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1767,9 +1856,8 @@ template <class TeamType, class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 2 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 2)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 2 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 2)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1795,9 +1883,8 @@ template <class TeamType, class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 3 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 3)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 3 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 3)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1825,9 +1912,8 @@ template <class TeamType, class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 4 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 4)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 4 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 4)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1858,9 +1944,8 @@ template <class TeamType, class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 5 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 5)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 5 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 5)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1893,9 +1978,8 @@ template <class TeamType, class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 6 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 6)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 6 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 6)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1930,9 +2014,8 @@ template <class TeamType, class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 7 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 7)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 7 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 7)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1969,9 +2052,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 1 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 1)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 1 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 1)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -1986,9 +2068,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 2 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 2)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 2 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 2)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2004,9 +2085,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 3 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 3)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 3 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 3)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2024,9 +2104,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 4 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 4)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 4 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 4)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2045,9 +2124,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 5 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 5)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 5 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 5)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2067,9 +2145,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 6 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 6)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 6 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 6)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2090,9 +2167,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy(
 template <class DT, class... DP, class ST, class... SP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst, const View<ST, SP...>& src,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 7 &&
-                             unsigned(ViewTraits<ST, SP...>::rank) ==
-                                 7)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 7 &&
+                      unsigned(ViewTraits<ST, SP...>::rank) == 7)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2118,9 +2194,8 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   Kokkos::parallel_for(Kokkos::TeamVectorRange(team, dst.span()),
                        [&](const int& i) { dst.data()[i] = value; });
 }
@@ -2129,9 +2204,8 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<std::is_same<
-        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
-        nullptr) {
+    std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize,
+                                  void>::value>* = nullptr) {
   for (size_t i = 0; i < dst.span(); ++i) {
     dst.data()[i] = value;
   }
@@ -2141,8 +2215,7 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             1)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 1)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2159,8 +2232,7 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             2)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 2)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2186,8 +2258,7 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             3)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 3)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2215,8 +2286,7 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             4)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 4)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2247,8 +2317,7 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             5)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 5)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2281,8 +2350,7 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             6)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 6)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2317,8 +2385,7 @@ template <class TeamType, class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const TeamType& team, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             7)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 7)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2356,8 +2423,7 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             1)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 1)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2373,8 +2439,7 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             2)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 2)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2391,8 +2456,7 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             3)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 3)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2410,8 +2474,7 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             4)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 4)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2431,8 +2494,7 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             5)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 5)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2453,8 +2515,7 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             6)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 6)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2476,8 +2537,7 @@ template <class DT, class... DP>
 void KOKKOS_INLINE_FUNCTION local_deep_copy(
     const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
-                             7)>::type* = nullptr) {
+    std::enable_if_t<(unsigned(ViewTraits<DT, DP...>::rank) == 7)>* = nullptr) {
   if (dst.data() == nullptr) {
     return;
   }
@@ -2509,12 +2569,11 @@ template <class ExecSpace, class DT, class... DP>
 inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<
+    std::enable_if_t<
         Kokkos::is_execution_space<ExecSpace>::value &&
-        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        Kokkos::SpaceAccessibility<
-            ExecSpace,
-            typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
+        std::is_void<typename ViewTraits<DT, DP...>::specialize>::value &&
+        Kokkos::SpaceAccessibility<ExecSpace, typename ViewTraits<DT, DP...>::
+                                                  memory_space>::accessible>* =
         nullptr) {
   using dst_traits = ViewTraits<DT, DP...>;
   static_assert(std::is_same<typename dst_traits::non_const_value_type,
@@ -2533,12 +2592,52 @@ inline void deep_copy(
   } else if (dst.span_is_contiguous()) {
     Impl::contiguous_fill_or_memset(space, dst, value);
   } else {
-    using ViewTypeUniform = typename std::conditional<
-        View<DT, DP...>::Rank == 0,
-        typename View<DT, DP...>::uniform_runtime_type,
-        typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
-    Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
-                           ExecSpace>(dst, value, space);
+    using ViewType = View<DT, DP...>;
+    // Figure out iteration order to do the ViewFill
+    int64_t strides[ViewType::Rank + 1];
+    dst.stride(strides);
+    Kokkos::Iterate iterate;
+    if (std::is_same<typename ViewType::array_layout,
+                     Kokkos::LayoutRight>::value) {
+      iterate = Kokkos::Iterate::Right;
+    } else if (std::is_same<typename ViewType::array_layout,
+                            Kokkos::LayoutLeft>::value) {
+      iterate = Kokkos::Iterate::Left;
+    } else if (std::is_same<typename ViewType::array_layout,
+                            Kokkos::LayoutStride>::value) {
+      if (strides[0] > strides[ViewType::Rank > 0 ? ViewType::Rank - 1 : 0])
+        iterate = Kokkos::Iterate::Right;
+      else
+        iterate = Kokkos::Iterate::Left;
+    } else {
+      if (std::is_same<typename ViewType::execution_space::array_layout,
+                       Kokkos::LayoutRight>::value)
+        iterate = Kokkos::Iterate::Right;
+      else
+        iterate = Kokkos::Iterate::Left;
+    }
+
+    // Lets call the right ViewFill functor based on integer space needed and
+    // iteration type
+    using ViewTypeUniform =
+        std::conditional_t<ViewType::Rank == 0,
+                           typename ViewType::uniform_runtime_type,
+                           typename ViewType::uniform_runtime_nomemspace_type>;
+    if (dst.span() > static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+      if (iterate == Kokkos::Iterate::Right)
+        Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight, ExecSpace,
+                               ViewType::Rank, int64_t>(dst, value, space);
+      else
+        Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft, ExecSpace,
+                               ViewType::Rank, int64_t>(dst, value, space);
+    } else {
+      if (iterate == Kokkos::Iterate::Right)
+        Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight, ExecSpace,
+                               ViewType::Rank, int32_t>(dst, value, space);
+      else
+        Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft, ExecSpace,
+                               ViewType::Rank, int32_t>(dst, value, space);
+    }
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -2551,12 +2650,11 @@ template <class ExecSpace, class DT, class... DP>
 inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
-    typename std::enable_if<
+    std::enable_if_t<
         Kokkos::is_execution_space<ExecSpace>::value &&
-        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        !Kokkos::SpaceAccessibility<
-            ExecSpace,
-            typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
+        std::is_void<typename ViewTraits<DT, DP...>::specialize>::value &&
+        !Kokkos::SpaceAccessibility<ExecSpace, typename ViewTraits<DT, DP...>::
+                                                   memory_space>::accessible>* =
         nullptr) {
   using dst_traits = ViewTraits<DT, DP...>;
   static_assert(std::is_same<typename dst_traits::non_const_value_type,
@@ -2579,10 +2677,10 @@ inline void deep_copy(
     if (dst.span_is_contiguous()) {
       Impl::contiguous_fill_or_memset(fill_exec_space(), dst, value);
     } else {
-      using ViewTypeUniform = typename std::conditional<
+      using ViewTypeUniform = std::conditional_t<
           View<DT, DP...>::Rank == 0,
           typename View<DT, DP...>::uniform_runtime_type,
-          typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
+          typename View<DT, DP...>::uniform_runtime_nomemspace_type>;
       Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
                              fill_exec_space>(dst, value, fill_exec_space());
     }
@@ -2600,10 +2698,9 @@ inline void deep_copy(
     const ExecSpace& exec_space,
     typename ViewTraits<ST, SP...>::non_const_value_type& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<
-        Kokkos::is_execution_space<ExecSpace>::value &&
-        std::is_same<typename ViewTraits<ST, SP...>::specialize,
-                     void>::value>::type* = nullptr) {
+    std::enable_if_t<Kokkos::is_execution_space<ExecSpace>::value &&
+                     std::is_same<typename ViewTraits<ST, SP...>::specialize,
+                                  void>::value>* = nullptr) {
   using src_traits       = ViewTraits<ST, SP...>;
   using src_memory_space = typename src_traits::memory_space;
   static_assert(src_traits::rank == 0,
@@ -2638,13 +2735,12 @@ template <class ExecSpace, class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(
-        Kokkos::is_execution_space<ExecSpace>::value &&
-        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
-        (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
-         unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>::type* =
-        nullptr) {
+    std::enable_if_t<
+        (Kokkos::is_execution_space<ExecSpace>::value &&
+         std::is_void<typename ViewTraits<DT, DP...>::specialize>::value &&
+         std::is_void<typename ViewTraits<ST, SP...>::specialize>::value &&
+         (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
+          unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>* = nullptr) {
   using src_traits = ViewTraits<ST, SP...>;
   using dst_traits = ViewTraits<DT, DP...>;
 
@@ -2689,12 +2785,12 @@ template <class ExecSpace, class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
-    typename std::enable_if<(
-        Kokkos::is_execution_space<ExecSpace>::value &&
-        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
-        (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
-         unsigned(ViewTraits<ST, SP...>::rank) != 0))>::type* = nullptr) {
+    std::enable_if_t<
+        (Kokkos::is_execution_space<ExecSpace>::value &&
+         std::is_void<typename ViewTraits<DT, DP...>::specialize>::value &&
+         std::is_void<typename ViewTraits<ST, SP...>::specialize>::value &&
+         (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
+          unsigned(ViewTraits<ST, SP...>::rank) != 0))>* = nullptr) {
   using dst_type = View<DT, DP...>;
   using src_type = View<ST, SP...>;
 
@@ -2855,8 +2951,8 @@ inline void deep_copy(
       Impl::view_copy(exec_space, dst, src);
     } else if (DstExecCanAccessSrc || SrcExecCanAccessDst) {
       using cpy_exec_space =
-          typename std::conditional<DstExecCanAccessSrc, dst_execution_space,
-                                    src_execution_space>::type;
+          std::conditional_t<DstExecCanAccessSrc, dst_execution_space,
+                             src_execution_space>;
       exec_space.fence(
           "Kokkos::deep_copy: view-to-view noncontiguous copy on space, pre "
           "copy");
@@ -2900,19 +2996,30 @@ bool size_mismatch(const ViewType& view, unsigned int max_extent,
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
-template <class... I, class T, class... P>
+template <class T, class... P, class... ViewCtorArgs>
 inline typename std::enable_if<
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutLeft>::value ||
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutRight>::value>::type
-impl_resize(Kokkos::View<T, P...>& v, const size_t n0, const size_t n1,
+impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+            Kokkos::View<T, P...>& v, const size_t n0, const size_t n1,
             const size_t n2, const size_t n3, const size_t n4, const size_t n5,
-            const size_t n6, const size_t n7, const I&... arg_prop) {
-  using view_type = Kokkos::View<T, P...>;
+            const size_t n6, const size_t n7) {
+  using view_type        = Kokkos::View<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::resize "
+                "must not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a memory space instance!");
 
   // TODO (mfh 27 Jun 2017) If the old View has enough space but just
   // different dimensions (e.g., if the product of the dimensions,
@@ -2925,22 +3032,57 @@ impl_resize(Kokkos::View<T, P...>& v, const size_t n0, const size_t n1,
   const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
   if (sizeMismatch) {
-    view_type v_resized(view_alloc(v.label(), arg_prop...), n0, n1, n2, n3, n4,
-                        n5, n6, n7);
-
-    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
-    Kokkos::fence("Kokkos::resize(View)");
+    // Add execution space here to avoid the need for if constexpr below
+    using alloc_prop = Impl::ViewCtorProp<
+        ViewCtorArgs..., std::string,
+        std::conditional_t<alloc_prop_input::has_execution_space,
+                           std::integral_constant<unsigned int, 10>,
+                           typename view_type::execution_space>>;
+    alloc_prop prop_copy(arg_prop);
+    static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+        v.label();
+
+    view_type v_resized(prop_copy, n0, n1, n2, n3, n4, n5, n6, n7);
+
+    if (alloc_prop_input::has_execution_space)
+      Kokkos::Impl::ViewRemap<view_type, view_type>(
+          v_resized, v,
+          static_cast<const Impl::ViewCtorProp<
+              void, typename alloc_prop::execution_space>&>(prop_copy)
+              .value);
+    else {
+      Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+      Kokkos::fence("Kokkos::resize(View)");
+    }
 
     v = v_resized;
   }
 }
 
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutLeft>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutRight>::value>
+resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+       Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+       const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+       const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+       const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+       const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+       const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+       const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+       const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+  impl_resize(arg_prop, v, n0, n1, n2, n3, n4, n5, n6, n7);
+}
+
 template <class T, class... P>
-inline typename std::enable_if<
+inline std::enable_if_t<
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutLeft>::value ||
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
-                 Kokkos::LayoutRight>::value>::type
+                 Kokkos::LayoutRight>::value>
 resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -2949,18 +3091,17 @@ resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-  impl_resize(v, n0, n1, n2, n3, n4, n5, n6, n7);
+  impl_resize(Impl::ViewCtorProp<>{}, v, n0, n1, n2, n3, n4, n5, n6, n7);
 }
 
-/** \brief  Resize a view with copying old data to new data at the corresponding
- * indices. */
 template <class I, class T, class... P>
-inline typename std::enable_if<
-    Impl::is_view_ctor_property<I>::value &&
+inline std::enable_if_t<
+    (Impl::is_view_ctor_property<I>::value ||
+     Kokkos::is_execution_space<I>::value) &&
     (std::is_same<typename Kokkos::View<T, P...>::array_layout,
                   Kokkos::LayoutLeft>::value ||
      std::is_same<typename Kokkos::View<T, P...>::array_layout,
-                  Kokkos::LayoutRight>::value)>::type
+                  Kokkos::LayoutRight>::value)>
 resize(const I& arg_prop, Kokkos::View<T, P...>& v,
        const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -2970,12 +3111,10 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v,
        const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-  impl_resize(v, n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
+  impl_resize(Kokkos::view_alloc(arg_prop), v, n0, n1, n2, n3, n4, n5, n6, n7);
 }
 
-/** \brief  Resize a view with copying old data to new data at the corresponding
- * indices. */
-template <class... I, class T, class... P>
+template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutLeft>::value ||
@@ -2984,19 +3123,47 @@ inline std::enable_if_t<
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutStride>::value ||
     is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value>
-impl_resize(Kokkos::View<T, P...>& v,
-            const typename Kokkos::View<T, P...>::array_layout& layout,
-            const I&... arg_prop) {
-  using view_type = Kokkos::View<T, P...>;
+impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+            Kokkos::View<T, P...>& v,
+            const typename Kokkos::View<T, P...>::array_layout& layout) {
+  using view_type        = Kokkos::View<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::resize "
+                "must not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a memory space instance!");
 
   if (v.layout() != layout) {
-    view_type v_resized(view_alloc(v.label(), arg_prop...), layout);
-
-    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
-    Kokkos::fence("Kokkos::resize(View)");
+    // Add execution space here to avoid the need for if constexpr below
+    using alloc_prop = Impl::ViewCtorProp<
+        ViewCtorArgs..., std::string,
+        std::conditional_t<alloc_prop_input::has_execution_space,
+                           std::integral_constant<unsigned int, 10>,
+                           typename view_type::execution_space>>;
+    alloc_prop prop_copy(arg_prop);
+    static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+        v.label();
+
+    view_type v_resized(prop_copy, layout);
+
+    if (alloc_prop::has_execution_space)
+      Kokkos::Impl::ViewRemap<view_type, view_type>(
+          v_resized, v,
+          static_cast<const Impl::ViewCtorProp<
+              void, typename alloc_prop::execution_space>&>(prop_copy)
+              .value);
+    else {
+      Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+      Kokkos::fence("Kokkos::resize(View)");
+    }
 
     v = v_resized;
   }
@@ -3005,7 +3172,7 @@ impl_resize(Kokkos::View<T, P...>& v,
 // FIXME User-provided (custom) layouts are not required to have a comparison
 // operator. Hence, there is no way to check if the requested layout is actually
 // the same as the existing one.
-template <class... I, class T, class... P>
+template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
     !(std::is_same<typename Kokkos::View<T, P...>::array_layout,
                    Kokkos::LayoutLeft>::value ||
@@ -3014,68 +3181,156 @@ inline std::enable_if_t<
       std::is_same<typename Kokkos::View<T, P...>::array_layout,
                    Kokkos::LayoutStride>::value ||
       is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value)>
-impl_resize(Kokkos::View<T, P...>& v,
-            const typename Kokkos::View<T, P...>::array_layout& layout,
-            const I&... arg_prop) {
-  using view_type = Kokkos::View<T, P...>;
+impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+            Kokkos::View<T, P...>& v,
+            const typename Kokkos::View<T, P...>::array_layout& layout) {
+  using view_type        = Kokkos::View<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
-
-  view_type v_resized(view_alloc(v.label(), arg_prop...), layout);
-
-  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::resize "
+                "must not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::resize must "
+                "not include a memory space instance!");
+
+  // Add execution space here to avoid the need for if constexpr below
+  using alloc_prop = Impl::ViewCtorProp<
+      ViewCtorArgs..., std::string,
+      std::conditional_t<alloc_prop_input::has_execution_space,
+                         std::integral_constant<unsigned int, 10>,
+                         typename view_type::execution_space>>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      v.label();
+
+  view_type v_resized(prop_copy, layout);
+
+  if (alloc_prop::has_execution_space)
+    Kokkos::Impl::ViewRemap<view_type, view_type>(
+        v_resized, v,
+        static_cast<const Impl::ViewCtorProp<
+            void, typename alloc_prop::execution_space>&>(prop_copy)
+            .value);
+  else {
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    Kokkos::fence("Kokkos::resize(View)");
+  }
 
   v = v_resized;
 }
 
+template <class T, class... P, class... ViewCtorArgs>
+inline void resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+                   Kokkos::View<T, P...>& v,
+                   const typename Kokkos::View<T, P...>::array_layout& layout) {
+  impl_resize(arg_prop, v, layout);
+}
+
 template <class I, class T, class... P>
-inline std::enable_if_t<Impl::is_view_ctor_property<I>::value> resize(
-    const I& arg_prop, Kokkos::View<T, P...>& v,
-    const typename Kokkos::View<T, P...>::array_layout& layout) {
-  impl_resize(v, layout, arg_prop);
+inline std::enable_if_t<Impl::is_view_ctor_property<I>::value ||
+                        Kokkos::is_execution_space<I>::value>
+resize(const I& arg_prop, Kokkos::View<T, P...>& v,
+       const typename Kokkos::View<T, P...>::array_layout& layout) {
+  impl_resize(arg_prop, v, layout);
+}
+
+template <class ExecutionSpace, class T, class... P>
+inline void resize(const ExecutionSpace& exec_space, Kokkos::View<T, P...>& v,
+                   const typename Kokkos::View<T, P...>::array_layout& layout) {
+  impl_resize(Impl::ViewCtorProp<>(), exec_space, v, layout);
 }
 
 template <class T, class... P>
 inline void resize(Kokkos::View<T, P...>& v,
                    const typename Kokkos::View<T, P...>::array_layout& layout) {
-  impl_resize(v, layout);
+  impl_resize(Impl::ViewCtorProp<>{}, v, layout);
 }
 
 /** \brief  Resize a view with discarding old data. */
-template <class... I, class T, class... P>
-inline typename std::enable_if<
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutLeft>::value ||
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
-                 Kokkos::LayoutRight>::value>::type
+                 Kokkos::LayoutRight>::value>
 impl_realloc(Kokkos::View<T, P...>& v, const size_t n0, const size_t n1,
              const size_t n2, const size_t n3, const size_t n4, const size_t n5,
-             const size_t n6, const size_t n7, const I&... arg_prop) {
-  using view_type = Kokkos::View<T, P...>;
+             const size_t n6, const size_t n7,
+             const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using view_type        = Kokkos::View<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only realloc managed views");
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a memory space instance!");
 
   const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
   const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
   if (sizeMismatch) {
-    const std::string label = v.label();
-
-    v = view_type();  // Deallocate first, if the only view to allocation
-    v = view_type(view_alloc(label, arg_prop...), n0, n1, n2, n3, n4, n5, n6,
-                  n7);
-  } else if (!Kokkos::Impl::has_type<Impl::WithoutInitializing_t, I...>::value)
-    Kokkos::deep_copy(v, typename view_type::value_type{});
+    using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+    alloc_prop arg_prop_copy(arg_prop);
+    static_cast<Kokkos::Impl::ViewCtorProp<void, std::string>&>(arg_prop_copy)
+        .value = v.label();
+    v = view_type();  // Best effort to deallocate in case no other view refers
+                      // to the shared allocation
+    v = view_type(arg_prop_copy, n0, n1, n2, n3, n4, n5, n6, n7);
+  } else if (alloc_prop_input::initialize) {
+    if (alloc_prop_input::has_execution_space) {
+      using alloc_prop = Impl::ViewCtorProp<
+          ViewCtorArgs...,
+          std::conditional_t<alloc_prop_input::has_execution_space,
+                             std::integral_constant<unsigned int, 2>,
+                             typename view_type::execution_space>>;
+      alloc_prop arg_prop_copy(arg_prop);
+      auto const& exec_space = static_cast<Kokkos::Impl::ViewCtorProp<
+          void, typename alloc_prop::execution_space> const&>(arg_prop_copy)
+                                   .value;
+      Kokkos::deep_copy(exec_space, v, typename view_type::value_type{});
+    } else
+      Kokkos::deep_copy(v, typename view_type::value_type{});
+  }
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutLeft>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutRight>::value>
+realloc(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+        Kokkos::View<T, P...>& v,
+        const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+        const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
 }
 
 template <class T, class... P>
-inline typename std::enable_if<
+inline std::enable_if_t<
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutLeft>::value ||
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
-                 Kokkos::LayoutRight>::value>::type
+                 Kokkos::LayoutRight>::value>
 realloc(Kokkos::View<T, P...>& v,
         const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
         const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -3085,16 +3340,16 @@ realloc(Kokkos::View<T, P...>& v,
         const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
         const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
         const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7);
+  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, Impl::ViewCtorProp<>{});
 }
 
 template <class I, class T, class... P>
-inline typename std::enable_if<
+inline std::enable_if_t<
     Impl::is_view_ctor_property<I>::value &&
     (std::is_same<typename Kokkos::View<T, P...>::array_layout,
                   Kokkos::LayoutLeft>::value ||
      std::is_same<typename Kokkos::View<T, P...>::array_layout,
-                  Kokkos::LayoutRight>::value)>::type
+                  Kokkos::LayoutRight>::value)>
 realloc(const I& arg_prop, Kokkos::View<T, P...>& v,
         const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
         const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -3104,10 +3359,10 @@ realloc(const I& arg_prop, Kokkos::View<T, P...>& v,
         const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
         const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
         const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, arg_prop);
+  impl_realloc(v, n0, n1, n2, n3, n4, n5, n6, n7, Kokkos::view_alloc(arg_prop));
 }
 
-template <class... I, class T, class... P>
+template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
     std::is_same<typename Kokkos::View<T, P...>::array_layout,
                  Kokkos::LayoutLeft>::value ||
@@ -3118,24 +3373,53 @@ inline std::enable_if_t<
     is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value>
 impl_realloc(Kokkos::View<T, P...>& v,
              const typename Kokkos::View<T, P...>::array_layout& layout,
-             const I&... arg_prop) {
-  using view_type = Kokkos::View<T, P...>;
+             const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using view_type        = Kokkos::View<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only realloc managed views");
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a memory space instance!");
 
   if (v.layout() != layout) {
-    const std::string label = v.label();
-
     v = view_type();  // Deallocate first, if the only view to allocation
-    v = view_type(view_alloc(label, arg_prop...), layout);
+    v = view_type(arg_prop, layout);
+  } else if (alloc_prop_input::initialize) {
+    if (alloc_prop_input::has_execution_space) {
+      // Add execution_space if not provided to avoid need for if constexpr
+      using alloc_prop = Impl::ViewCtorProp<
+          ViewCtorArgs...,
+          std::conditional_t<alloc_prop_input::has_execution_space,
+                             std::integral_constant<unsigned int, 2>,
+                             typename view_type::execution_space>,
+          std::string>;
+      alloc_prop arg_prop_copy(arg_prop);
+      static_cast<Kokkos::Impl::ViewCtorProp<void, std::string>&>(arg_prop_copy)
+          .value                 = v.label();
+      using execution_space_type = typename alloc_prop::execution_space;
+      const execution_space_type& exec_space =
+          static_cast<
+              Kokkos::Impl::ViewCtorProp<void, execution_space_type> const&>(
+              arg_prop_copy)
+              .value;
+      Kokkos::deep_copy(exec_space, v, typename view_type::value_type{});
+    } else
+      Kokkos::deep_copy(v, typename view_type::value_type{});
   }
 }
 
 // FIXME User-provided (custom) layouts are not required to have a comparison
 // operator. Hence, there is no way to check if the requested layout is actually
 // the same as the existing one.
-template <class... I, class T, class... P>
+template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
     !(std::is_same<typename Kokkos::View<T, P...>::array_layout,
                    Kokkos::LayoutLeft>::value ||
@@ -3146,30 +3430,51 @@ inline std::enable_if_t<
       is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value)>
 impl_realloc(Kokkos::View<T, P...>& v,
              const typename Kokkos::View<T, P...>::array_layout& layout,
-             const I&... arg_prop) {
-  using view_type = Kokkos::View<T, P...>;
+             const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using view_type        = Kokkos::View<T, P...>;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only realloc managed views");
-
-  const std::string label = v.label();
+  static_assert(!alloc_prop_input::has_label,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a label!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::has_memory_space,
+                "The view constructor arguments passed to Kokkos::realloc must "
+                "not include a memory space instance!");
 
   v = view_type();  // Deallocate first, if the only view to allocation
-  v = view_type(view_alloc(label, arg_prop...), layout);
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop arg_prop_copy(arg_prop);
+  static_cast<Kokkos::Impl::ViewCtorProp<void, std::string>&>(arg_prop_copy)
+      .value = v.label();
+  v          = view_type(arg_prop_copy, layout);
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+inline void realloc(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    Kokkos::View<T, P...>& v,
+    const typename Kokkos::View<T, P...>::array_layout& layout) {
+  impl_realloc(v, layout, arg_prop);
 }
 
 template <class I, class T, class... P>
 inline std::enable_if_t<Impl::is_view_ctor_property<I>::value> realloc(
     const I& arg_prop, Kokkos::View<T, P...>& v,
     const typename Kokkos::View<T, P...>::array_layout& layout) {
-  impl_realloc(v, layout, arg_prop);
+  impl_realloc(v, layout, Kokkos::view_alloc(arg_prop));
 }
 
 template <class T, class... P>
 inline void realloc(
     Kokkos::View<T, P...>& v,
     const typename Kokkos::View<T, P...>::array_layout& layout) {
-  impl_realloc(v, layout);
+  impl_realloc(v, layout, Impl::ViewCtorProp<>{});
 }
 
 } /* namespace Kokkos */
@@ -3201,8 +3506,8 @@ struct MirrorViewType {
   using dest_view_type = Kokkos::View<data_type, array_layout, Space>;
   // If it is the same memory_space return the existsing view_type
   // This will also keep the unmanaged trait if necessary
-  using view_type = typename std::conditional<is_same_memspace, src_view_type,
-                                              dest_view_type>::type;
+  using view_type =
+      std::conditional_t<is_same_memspace, src_view_type, dest_view_type>;
 };
 
 template <class Space, class T, class... P>
@@ -3225,18 +3530,38 @@ struct MirrorType {
   using view_type = Kokkos::View<data_type, array_layout, Space>;
 };
 
-template <class T, class... P, class... I>
-inline typename std::enable_if<
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
     !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
-                  Kokkos::LayoutStride>::value,
-    typename Kokkos::View<T, P...>::HostMirror>::type
-create_mirror(const Kokkos::View<T, P...>& src, const I&... arg_prop) {
-  using src_type = View<T, P...>;
-  using dst_type = typename src_type::HostMirror;
+                  Kokkos::LayoutStride>::value &&
+        !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
+    typename Kokkos::View<T, P...>::HostMirror>
+create_mirror(const Kokkos::View<T, P...>& src,
+              const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using src_type         = View<T, P...>;
+  using dst_type         = typename src_type::HostMirror;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
 
   return dst_type(
-      Kokkos::view_alloc(std::string(src.label()).append("_mirror"),
-                         arg_prop...),
+      prop_copy,
       src.rank_dynamic > 0 ? src.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       src.rank_dynamic > 1 ? src.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       src.rank_dynamic > 2 ? src.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -3247,14 +3572,30 @@ create_mirror(const Kokkos::View<T, P...>& src, const I&... arg_prop) {
       src.rank_dynamic > 7 ? src.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG);
 }
 
-template <class T, class... P, class... I>
-inline typename std::enable_if<
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
     std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
-                 Kokkos::LayoutStride>::value,
-    typename Kokkos::View<T, P...>::HostMirror>::type
-create_mirror(const Kokkos::View<T, P...>& src, const I&... arg_prop) {
-  using src_type = View<T, P...>;
-  using dst_type = typename src_type::HostMirror;
+                 Kokkos::LayoutStride>::value &&
+        !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
+    typename Kokkos::View<T, P...>::HostMirror>
+create_mirror(const Kokkos::View<T, P...>& src,
+              const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using src_type         = View<T, P...>;
+  using dst_type         = typename src_type::HostMirror;
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
 
   Kokkos::LayoutStride layout;
 
@@ -3276,123 +3617,199 @@ create_mirror(const Kokkos::View<T, P...>& src, const I&... arg_prop) {
   layout.stride[6] = src.stride_6();
   layout.stride[7] = src.stride_7();
 
-  return dst_type(Kokkos::view_alloc(std::string(src.label()).append("_mirror"),
-                                     arg_prop...),
-                  layout);
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
+
+  return dst_type(prop_copy, layout);
 }
 
 // Create a mirror in a new space (specialization for different space)
-template <class Space, class T, class... P, class... I>
-typename Impl::MirrorType<Space, T, P...>::view_type create_mirror(
-    const Space&, const Kokkos::View<T, P...>& src, const I&... arg_prop) {
-  return typename Impl::MirrorType<Space, T, P...>::view_type(
-      Kokkos::view_alloc(src.label(), arg_prop...), src.layout());
+template <class T, class... P, class... ViewCtorArgs,
+          class Enable = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+auto create_mirror(const Kokkos::View<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+
+  static_assert(
+      !alloc_prop_input::has_label,
+      "The view constructor arguments passed to Kokkos::create_mirror "
+      "must not include a label!");
+  static_assert(
+      !alloc_prop_input::has_pointer,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not include a pointer!");
+  static_assert(
+      !alloc_prop_input::allow_padding,
+      "The view constructor arguments passed to Kokkos::create_mirror must "
+      "not explicitly allow padding!");
+
+  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., std::string>;
+  alloc_prop prop_copy(arg_prop);
+  static_cast<Impl::ViewCtorProp<void, std::string>&>(prop_copy).value =
+      std::string(src.label()).append("_mirror");
+
+  return typename Impl::MirrorType<typename alloc_prop::memory_space, T,
+                                   P...>::view_type(prop_copy, src.layout());
 }
 }  // namespace Impl
 
 template <class T, class... P>
-std::enable_if_t<
-    std::is_same<typename ViewTraits<T, P...>::specialize, void>::value,
-    typename Kokkos::View<T, P...>::HostMirror>
+std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value,
+                 typename Kokkos::View<T, P...>::HostMirror>
 create_mirror(Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror(v);
+  return Impl::create_mirror(v, Impl::ViewCtorProp<>{});
 }
 
 template <class T, class... P>
-std::enable_if_t<
-    std::is_same<typename ViewTraits<T, P...>::specialize, void>::value,
-    typename Kokkos::View<T, P...>::HostMirror>
+std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value,
+                 typename Kokkos::View<T, P...>::HostMirror>
 create_mirror(Kokkos::Impl::WithoutInitializing_t wi,
               Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror(v, wi);
+  return Impl::create_mirror(v, view_alloc(wi));
 }
 
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
+std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value,
+                 typename Impl::MirrorType<Space, T, P...>::view_type>
+create_mirror(Space const&, Kokkos::View<T, P...> const& v) {
+  return Impl::create_mirror(v, view_alloc(typename Space::memory_space{}));
+}
+
+template <class T, class... P, class... ViewCtorArgs,
+          typename Enable = std::enable_if_t<
+              std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+auto create_mirror(Impl::ViewCtorProp<ViewCtorArgs...> const& arg_prop,
+                   Kokkos::View<T, P...> const& v) {
+  return Impl::create_mirror(v, arg_prop);
+}
+
+template <class T, class... P, class... ViewCtorArgs>
 std::enable_if_t<
-    std::is_same<typename ViewTraits<T, P...>::specialize, void>::value,
-    typename Impl::MirrorType<Space, T, P...>::view_type>
-create_mirror(Space const& space, Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror(space, v);
+    std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
+    typename Kokkos::View<T, P...>::HostMirror>
+create_mirror(Impl::ViewCtorProp<ViewCtorArgs...> const& arg_prop,
+              Kokkos::View<T, P...> const& v) {
+  return Impl::create_mirror(v, arg_prop);
 }
 
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
-std::enable_if_t<
-    std::is_same<typename ViewTraits<T, P...>::specialize, void>::value,
-    typename Impl::MirrorType<Space, T, P...>::view_type>
-create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const& space,
+std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value,
+                 typename Impl::MirrorType<Space, T, P...>::view_type>
+create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&,
               Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror(space, v, wi);
+  return Impl::create_mirror(v, view_alloc(typename Space::memory_space{}, wi));
 }
 
 namespace Impl {
 
-template <class T, class... P, class... I>
-inline typename std::enable_if<
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
     (std::is_same<
          typename Kokkos::View<T, P...>::memory_space,
          typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
      std::is_same<
          typename Kokkos::View<T, P...>::data_type,
          typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
-    typename Kokkos::View<T, P...>::HostMirror>::type
-create_mirror_view(const Kokkos::View<T, P...>& src, const I&...) {
+    typename Kokkos::View<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::View<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
-template <class T, class... P, class... I>
-inline typename std::enable_if<
+template <class T, class... P, class... ViewCtorArgs>
+inline std::enable_if_t<
     !(std::is_same<
           typename Kokkos::View<T, P...>::memory_space,
           typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
       std::is_same<
           typename Kokkos::View<T, P...>::data_type,
           typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
-    typename Kokkos::View<T, P...>::HostMirror>::type
-create_mirror_view(const Kokkos::View<T, P...>& src, const I&... arg_prop) {
-  return Kokkos::create_mirror(arg_prop..., src);
+    typename Kokkos::View<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::View<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 
 // Create a mirror view in a new space (specialization for same space)
-template <class Space, class T, class... P, class... I>
-typename std::enable_if<
-    Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
-    typename Impl::MirrorViewType<Space, T, P...>::view_type>::type
+template <class Space, class T, class... P, class... ViewCtorArgs>
+std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
+                 typename Impl::MirrorViewType<Space, T, P...>::view_type>
 create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
-                   const I&...) {
+                   const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
 // Create a mirror view in a new space (specialization for different space)
-template <class Space, class T, class... P, class... I>
-typename std::enable_if<
-    !Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
-    typename Impl::MirrorViewType<Space, T, P...>::view_type>::type
+template <class Space, class T, class... P, class... ViewCtorArgs>
+std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
+                 typename Impl::MirrorViewType<Space, T, P...>::view_type>
 create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
-                   const I&... arg_prop) {
-  return typename Impl::MirrorViewType<Space, T, P...>::view_type(
-      Kokkos::view_alloc(src.label(), arg_prop...), src.layout());
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  using MemorySpace = typename Space::memory_space;
+  using alloc_prop  = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
+  alloc_prop prop_copy(arg_prop);
+
+  return Kokkos::Impl::create_mirror(src, prop_copy);
 }
 }  // namespace Impl
 
 template <class T, class... P>
-typename Kokkos::View<T, P...>::HostMirror create_mirror_view(
-    Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror_view(v);
+std::enable_if_t<
+    std::is_same<
+        typename Kokkos::View<T, P...>::memory_space,
+        typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
+        std::is_same<
+            typename Kokkos::View<T, P...>::data_type,
+            typename Kokkos::View<T, P...>::HostMirror::data_type>::value,
+    typename Kokkos::View<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::View<T, P...>& src) {
+  return src;
+}
+
+template <class T, class... P>
+std::enable_if_t<
+    !(std::is_same<
+          typename Kokkos::View<T, P...>::memory_space,
+          typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
+      std::is_same<
+          typename Kokkos::View<T, P...>::data_type,
+          typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
+    typename Kokkos::View<T, P...>::HostMirror>
+create_mirror_view(const Kokkos::View<T, P...>& src) {
+  return Kokkos::create_mirror(src);
 }
 
 template <class T, class... P>
 typename Kokkos::View<T, P...>::HostMirror create_mirror_view(
     Kokkos::Impl::WithoutInitializing_t wi, Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror_view(v, wi);
+  return Impl::create_mirror_view(v, view_alloc(wi));
 }
 
+// FIXME_C++17 Improve SFINAE here.
 template <class Space, class T, class... P,
-          typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
+          class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
+typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
+    const Space&, const Kokkos::View<T, P...>& src,
+    std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace>* =
+        nullptr) {
+  return src;
+}
+
+// FIXME_C++17 Improve SFINAE here.
+template <class Space, class T, class... P,
+          class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
-    Space const& space, Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror_view(space, v);
+    const Space& space, const Kokkos::View<T, P...>& src,
+    std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace>* =
+        nullptr) {
+  return Kokkos::create_mirror(space, src);
 }
 
 template <class Space, class T, class... P,
@@ -3400,43 +3817,112 @@ template <class Space, class T, class... P,
 typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
     Kokkos::Impl::WithoutInitializing_t wi, Space const& space,
     Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror_view(space, v, wi);
+  return Impl::create_mirror_view(space, v, view_alloc(wi));
+}
+
+template <class T, class... P, class... ViewCtorArgs>
+auto create_mirror_view(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+                        const Kokkos::View<T, P...>& v) {
+  return Impl::create_mirror_view(v, arg_prop);
+}
+
+template <class... ViewCtorArgs, class T, class... P>
+auto create_mirror_view_and_copy(
+    const Impl::ViewCtorProp<ViewCtorArgs...>&,
+    const Kokkos::View<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        Impl::MirrorViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace>* = nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  static_assert(
+      alloc_prop_input::has_memory_space,
+      "The view constructor arguments passed to "
+      "Kokkos::create_mirror_view_and_copy must include a memory space!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::allow_padding,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not explicitly allow padding!");
+
+  // same behavior as deep_copy(src, src)
+  if (!alloc_prop_input::has_execution_space)
+    fence(
+        "Kokkos::create_mirror_view_and_copy: fence before returning src view");
+  return src;
 }
 
-// Create a mirror view and deep_copy in a new space (specialization for same
-// space)
-template <class Space, class T, class... P>
-typename Impl::MirrorViewType<Space, T, P...>::view_type
-create_mirror_view_and_copy(
-    const Space&, const Kokkos::View<T, P...>& src,
-    std::string const& name = "",
-    typename std::enable_if<
-        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
-        Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
-        nullptr) {
-  (void)name;
-  fence(
-      "Kokkos::create_mirror_view_and_copy: fence before returning src view");  // same behavior as deep_copy(src, src)
-  return src;
+template <class... ViewCtorArgs, class T, class... P>
+auto create_mirror_view_and_copy(
+    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop,
+    const Kokkos::View<T, P...>& src,
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value &&
+        !Impl::MirrorViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace>* = nullptr) {
+  using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  static_assert(
+      alloc_prop_input::has_memory_space,
+      "The view constructor arguments passed to "
+      "Kokkos::create_mirror_view_and_copy must include a memory space!");
+  static_assert(!alloc_prop_input::has_pointer,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not include a pointer!");
+  static_assert(!alloc_prop_input::allow_padding,
+                "The view constructor arguments passed to "
+                "Kokkos::create_mirror_view_and_copy must "
+                "not explicitly allow padding!");
+  using Space  = typename alloc_prop_input::memory_space;
+  using Mirror = typename Impl::MirrorViewType<Space, T, P...>::view_type;
+
+  // Add some properties if not provided to avoid need for if constexpr
+  using alloc_prop = Impl::ViewCtorProp<
+      ViewCtorArgs...,
+      std::conditional_t<alloc_prop_input::has_label,
+                         std::integral_constant<unsigned int, 12>, std::string>,
+      std::conditional_t<!alloc_prop_input::initialize,
+                         std::integral_constant<unsigned int, 13>,
+                         Impl::WithoutInitializing_t>,
+      std::conditional_t<alloc_prop_input::has_execution_space,
+                         std::integral_constant<unsigned int, 14>,
+                         typename Space::execution_space>>;
+  alloc_prop arg_prop_copy(arg_prop);
+
+  std::string& label =
+      static_cast<Impl::ViewCtorProp<void, std::string>&>(arg_prop_copy).value;
+  if (label.empty()) label = src.label();
+  auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()};
+  if (alloc_prop_input::has_execution_space) {
+    using ExecutionSpace = typename alloc_prop::execution_space;
+    deep_copy(
+        static_cast<Impl::ViewCtorProp<void, ExecutionSpace>&>(arg_prop_copy)
+            .value,
+        mirror, src);
+  } else
+    deep_copy(mirror, src);
+  return mirror;
 }
 
-// Create a mirror view and deep_copy in a new space (specialization for
-// different space)
-template <class Space, class T, class... P>
+// Previously when using auto here, the intel compiler 19.3 would
+// sometimes not create a symbol, guessing that it somehow is a combination
+// of auto and just forwarding arguments (see issue #5196)
+template <class Space, class T, class... P,
+          typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 typename Impl::MirrorViewType<Space, T, P...>::view_type
 create_mirror_view_and_copy(
     const Space&, const Kokkos::View<T, P...>& src,
     std::string const& name = "",
-    typename std::enable_if<
-        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
-        !Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
+    std::enable_if_t<
+        std::is_void<typename ViewTraits<T, P...>::specialize>::value>* =
         nullptr) {
-  using Mirror      = typename Impl::MirrorViewType<Space, T, P...>::view_type;
-  std::string label = name.empty() ? src.label() : name;
-  auto mirror       = typename Mirror::non_const_type{
-      view_alloc(WithoutInitializing, label), src.layout()};
-  deep_copy(mirror, src);
-  return mirror;
+  return create_mirror_view_and_copy(
+      Kokkos::view_alloc(typename Space::memory_space{}, name), src);
 }
 
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@@ -3448,8 +3934,7 @@ KOKKOS_DEPRECATED_WITH_COMMENT(
 typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
     const Space&, const Kokkos::View<T, P...>& src,
     Kokkos::Impl::WithoutInitializing_t,
-    typename std::enable_if<
-        Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
+    std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace>* =
         nullptr) {
   return src;
 }
@@ -3462,8 +3947,7 @@ KOKKOS_DEPRECATED_WITH_COMMENT(
 typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
     const Space&, const Kokkos::View<T, P...>& src,
     Kokkos::Impl::WithoutInitializing_t,
-    typename std::enable_if<
-        !Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
+    std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace>* =
         nullptr) {
   using Mirror = typename Impl::MirrorViewType<Space, T, P...>::view_type;
   return Mirror(view_alloc(WithoutInitializing, src.label()), src.layout());
diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp
index 232873d3f..3a9aaafbc 100644
--- a/packages/kokkos/core/src/Kokkos_Core.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_CORE_HPP
 #define KOKKOS_CORE_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE
+#endif
 
 //----------------------------------------------------------------------------
 // Include the execution space header files for the enabled execution spaces.
@@ -71,9 +75,9 @@
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Complex.hpp>
 #include <Kokkos_CopyViews.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 #include <functional>
 #include <iosfwd>
-#include <map>
 #include <memory>
 #include <vector>
 
@@ -81,87 +85,16 @@
 
 namespace Kokkos {
 
-struct InitArguments {
-  int num_threads;
-  int num_numa;
-  int device_id;
-  int ndevices;
-  int skip_device;
-  bool disable_warnings;
-  bool tune_internals;
-  bool tool_help        = false;
-  std::string tool_lib  = {};
-  std::string tool_args = {};
-
-  InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false,
-                bool ti = false)
-      : num_threads{nt},
-        num_numa{nn},
-        device_id{dv},
-        ndevices{-1},
-        skip_device{9999},
-        disable_warnings{dw},
-        tune_internals{ti} {}
-  Tools::InitArguments impl_get_tools_init_arguments() const {
-    Tools::InitArguments init_tools;
-    init_tools.tune_internals =
-        tune_internals ? Tools::InitArguments::PossiblyUnsetOption::on
-                       : Tools::InitArguments::PossiblyUnsetOption::unset;
-    init_tools.help = tool_help
-                          ? Tools::InitArguments::PossiblyUnsetOption::on
-                          : Tools::InitArguments::PossiblyUnsetOption::unset;
-    init_tools.lib = tool_lib.empty()
-                         ? Kokkos::Tools::InitArguments::unset_string_option
-                         : tool_lib;
-    init_tools.args = tool_args.empty()
-                          ? Kokkos::Tools::InitArguments::unset_string_option
-                          : tool_args;
-    return init_tools;
-  }
-};
-
-namespace Impl {
+void initialize(int& argc, char* argv[]);
 
-/* ExecSpaceManager - Responsible for initializing all of the registered
- * backends. Backends are registered using the register_space_initializer()
- * function which should be called from a global context so that it is called
- * prior to initialize_spaces() which is called from Kokkos::initialize()
- */
-class ExecSpaceManager {
-  std::map<std::string, std::unique_ptr<ExecSpaceInitializerBase>>
-      exec_space_factory_list;
-
- public:
-  ExecSpaceManager() = default;
-
-  void register_space_factory(std::string name,
-                              std::unique_ptr<ExecSpaceInitializerBase> ptr);
-  void initialize_spaces(const Kokkos::InitArguments& args);
-  void finalize_spaces(const bool all_spaces);
-  void static_fence();
-  void static_fence(const std::string&);
-  void print_configuration(std::ostream& msg, const bool detail);
-  static ExecSpaceManager& get_instance();
-};
-
-template <class SpaceInitializerType>
-int initialize_space_factory(std::string name) {
-  auto space_ptr = std::make_unique<SpaceInitializerType>();
-  ExecSpaceManager::get_instance().register_space_factory(name,
-                                                          std::move(space_ptr));
-  return 1;
-}
-
-}  // namespace Impl
-void initialize(int& narg, char* arg[]);
-
-void initialize(InitArguments args = InitArguments());
+void initialize(
+    InitializationSettings const& settings = InitializationSettings());
 
 namespace Impl {
 
-void pre_initialize(const InitArguments& args);
+void pre_initialize(const InitializationSettings& settings);
 
-void post_initialize(const InitArguments& args);
+void post_initialize(const InitializationSettings& settings);
 
 void declare_configuration_metadata(const std::string& category,
                                     const std::string& key,
@@ -169,7 +102,8 @@ void declare_configuration_metadata(const std::string& category,
 
 }  // namespace Impl
 
-bool is_initialized() noexcept;
+KOKKOS_ATTRIBUTE_NODISCARD bool is_initialized() noexcept;
+KOKKOS_ATTRIBUTE_NODISCARD bool is_finalized() noexcept;
 
 bool show_warnings() noexcept;
 bool tune_internals() noexcept;
@@ -199,14 +133,13 @@ void finalize();
  */
 void push_finalize_hook(std::function<void()> f);
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 /** \brief  Finalize all known execution spaces */
-void finalize_all();
-
-void fence();
-void fence(const std::string&);
+KOKKOS_DEPRECATED void finalize_all();
+#endif
 
 /** \brief Print "Bill of Materials" */
-void print_configuration(std::ostream&, const bool detail = false);
+void print_configuration(std::ostream& os, bool verbose = false);
 
 }  // namespace Kokkos
 
@@ -219,7 +152,7 @@ namespace Kokkos {
  * The allocation is tracked in Kokkos memory tracking system, so
  * leaked memory can be identified.
  */
-template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
+template <class Space = Kokkos::DefaultExecutionSpace::memory_space>
 inline void* kokkos_malloc(const std::string& arg_alloc_label,
                            const size_t arg_alloc_size) {
   using MemorySpace = typename Space::memory_space;
@@ -227,21 +160,21 @@ inline void* kokkos_malloc(const std::string& arg_alloc_label,
       MemorySpace(), arg_alloc_label, arg_alloc_size);
 }
 
-template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
+template <class Space = Kokkos::DefaultExecutionSpace::memory_space>
 inline void* kokkos_malloc(const size_t arg_alloc_size) {
   using MemorySpace = typename Space::memory_space;
   return Impl::SharedAllocationRecord<MemorySpace>::allocate_tracked(
       MemorySpace(), "no-label", arg_alloc_size);
 }
 
-template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
+template <class Space = Kokkos::DefaultExecutionSpace::memory_space>
 inline void kokkos_free(void* arg_alloc) {
   using MemorySpace = typename Space::memory_space;
   return Impl::SharedAllocationRecord<MemorySpace>::deallocate_tracked(
       arg_alloc);
 }
 
-template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
+template <class Space = Kokkos::DefaultExecutionSpace::memory_space>
 inline void* kokkos_realloc(void* arg_alloc, const size_t arg_alloc_size) {
   using MemorySpace = typename Space::memory_space;
   return Impl::SharedAllocationRecord<MemorySpace>::reallocate_tracked(
@@ -260,37 +193,153 @@ namespace Kokkos {
  *     if Kokkos::is_initialized() in the constructor, don't call
  * Kokkos::initialize or Kokkos::finalize it is not copyable or assignable
  */
+namespace Impl {
+
+inline std::string scopeguard_correct_usage() {
+  return std::string(
+      "Do instead:\n"
+      "  std::unique_ptr<Kokkos::ScopeGuard> guard =\n"
+      "    !Kokkos::is_initialized() && !Kokkos::is_finalized()?\n"
+      "    new ScopeGuard(argc,argv) : nullptr;\n");
+}
+
+inline std::string scopeguard_create_while_initialized_warning() {
+  return std::string(
+             "Kokkos Error: Creating a ScopeGuard while Kokkos is initialized "
+             "is illegal.\n")
+      .append(scopeguard_correct_usage());
+}
+
+inline std::string scopeguard_create_after_finalize_warning() {
+  return std::string(
+             "Kokkos Error: Creating a ScopeGuard after Kokkos was finalized "
+             "is illegal.\n")
+      .append(scopeguard_correct_usage());
+}
 
-class ScopeGuard {
+inline std::string scopeguard_destruct_after_finalize_warning() {
+  return std::string(
+             "Kokkos Error: Destroying a ScopeGuard after Kokkos was finalized "
+             "is illegal.\n")
+      .append(scopeguard_correct_usage());
+}
+
+}  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard {
  public:
-  ScopeGuard(int& narg, char* arg[]) {
+#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907
+  KOKKOS_ATTRIBUTE_NODISCARD
+#endif
+  ScopeGuard(int& argc, char* argv[]) {
     sg_init = false;
-    if (!Kokkos::is_initialized()) {
-      initialize(narg, arg);
+#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
+    if (is_initialized()) {
+      std::cerr << Impl::scopeguard_create_while_initialized_warning()
+                << std::endl;
+    }
+    if (is_finalized()) {
+      std::cerr << Impl::scopeguard_create_after_finalize_warning()
+                << std::endl;
+    }
+#endif
+    if (!is_initialized()) {
+      initialize(argc, argv);
       sg_init = true;
     }
   }
 
-  ScopeGuard(const InitArguments& args = InitArguments()) {
+#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907
+  KOKKOS_ATTRIBUTE_NODISCARD
+#endif
+  explicit ScopeGuard(
+      const InitializationSettings& settings = InitializationSettings()) {
     sg_init = false;
-    if (!Kokkos::is_initialized()) {
-      initialize(args);
+#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
+    if (is_initialized()) {
+      std::cerr << Impl::scopeguard_create_while_initialized_warning()
+                << std::endl;
+    }
+    if (is_finalized()) {
+      std::cerr << Impl::scopeguard_create_after_finalize_warning()
+                << std::endl;
+    }
+#endif
+    if (!is_initialized()) {
+      initialize(settings);
       sg_init = true;
     }
   }
 
   ~ScopeGuard() {
-    if (Kokkos::is_initialized() && sg_init) {
+#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
+    if (is_finalized()) {
+      std::cerr << Impl::scopeguard_destruct_after_finalize_warning()
+                << std::endl;
+    }
+#endif
+    if (is_initialized() && sg_init) {
       finalize();
     }
   }
 
-  // private:
+ private:
   bool sg_init;
 
+ public:
+  ScopeGuard& operator=(const ScopeGuard&) = delete;
+  ScopeGuard& operator=(ScopeGuard&&) = delete;
+  ScopeGuard(const ScopeGuard&)       = delete;
+  ScopeGuard(ScopeGuard&&)            = delete;
+};
+
+#else  // ifndef KOKKOS_ENABLE_DEPRECATED_CODE3
+
+class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard {
+ public:
+#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907
+  KOKKOS_ATTRIBUTE_NODISCARD
+#endif
+  ScopeGuard(int& argc, char* argv[]) {
+    if (is_initialized()) {
+      Kokkos::abort(
+          Impl::scopeguard_create_while_initialized_warning().c_str());
+    }
+    if (is_finalized()) {
+      Kokkos::abort(Impl::scopeguard_create_after_finalize_warning().c_str());
+    }
+    initialize(argc, argv);
+  }
+
+#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907
+  KOKKOS_ATTRIBUTE_NODISCARD
+#endif
+  ScopeGuard(
+      const InitializationSettings& settings = InitializationSettings()) {
+    if (is_initialized()) {
+      Kokkos::abort(
+          Impl::scopeguard_create_while_initialized_warning().c_str());
+    }
+    if (is_finalized()) {
+      Kokkos::abort(Impl::scopeguard_create_after_finalize_warning().c_str());
+    }
+    initialize(settings);
+  }
+
+  ~ScopeGuard() {
+    if (is_finalized()) {
+      Kokkos::abort(Impl::scopeguard_destruct_after_finalize_warning().c_str());
+    }
+    finalize();
+  }
+
   ScopeGuard& operator=(const ScopeGuard&) = delete;
-  ScopeGuard(const ScopeGuard&)            = delete;
+  ScopeGuard& operator=(ScopeGuard&&) = delete;
+  ScopeGuard(const ScopeGuard&)       = delete;
+  ScopeGuard(ScopeGuard&&)            = delete;
 };
+#endif
 
 }  // namespace Kokkos
 
@@ -343,9 +392,14 @@ std::vector<ExecSpace> partition_space(ExecSpace space,
 // implementation of the RAII wrapper is using Kokkos::single.
 #include <Kokkos_AcquireUniqueTokenImpl.hpp>
 
-// Specializations requires after core definitions
+// Specializations required after core definitions
 #include <KokkosCore_Config_PostInclude.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE
+#endif
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
index d04e6a75c..2bb323b4a 100644
--- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_CORE_FWD_HPP
 #define KOKKOS_CORE_FWD_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE_FWD
+#endif
 
 //----------------------------------------------------------------------------
 // Kokkos_Macros.hpp does introspection on configuration options
@@ -94,7 +98,10 @@ template <class ExecutionSpace, class MemorySpace>
 struct Device;
 
 // forward declare here so that backend initializer calls can use it.
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 struct InitArguments;
+#endif
+class InitializationSettings;
 
 }  // namespace Kokkos
 
@@ -132,6 +139,9 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
 #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL)
 using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
     Experimental::SYCL;
+#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC)
+using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
+    Experimental::OpenACC;
 #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP)
 using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = OpenMP;
 #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS)
@@ -143,7 +153,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
 using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial;
 #else
 #error \
-    "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial."
+    "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial."
 #endif
 
 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP)
@@ -199,7 +209,7 @@ struct RuntimeCheckMemoryAccessViolation {
 // explicit specialization: memory access violation will occur, call abort with
 // the specified error message.
 template <class MemorySpace, class AccessSpace>
-struct RuntimeCheckMemoryAccessViolation<AccessSpace, MemorySpace, false> {
+struct RuntimeCheckMemoryAccessViolation<MemorySpace, AccessSpace, false> {
   KOKKOS_FUNCTION RuntimeCheckMemoryAccessViolation(char const *const msg) {
     Kokkos::abort(msg);
   }
@@ -267,9 +277,6 @@ struct verify_space<DstMemorySpace, SrcMemorySpace, false> {
 };
 #endif
 
-// Base class for exec space initializer factories
-class ExecSpaceInitializerBase;
-
 }  // namespace Impl
 
 namespace Experimental {
@@ -292,8 +299,15 @@ class LogicalMemorySpace;
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+// Getting ICE in Trilinos in Sacado and Intrepid in deep_copy
+// See issue https://github.com/kokkos/kokkos/issues/5290
+// Simply taking string by value did not resolve the issue
+#ifdef KOKKOS_COMPILER_INTEL
 void fence();
-void fence(const std::string &);
+void fence(const std::string &name);
+#else
+void fence(const std::string &name = "Kokkos::fence: Unnamed Global Fence");
+#endif
 }  // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -354,12 +368,12 @@ class ParallelReduce;
 /// skip this and go directly to the documentation of the nonmember
 /// template function Kokkos::parallel_scan.
 template <class FunctorType, class ExecPolicy,
-          class ExecutionSapce = typename Impl::FunctorPolicyExecutionSpace<
+          class ExecutionSpace = typename Impl::FunctorPolicyExecutionSpace<
               FunctorType, ExecPolicy>::execution_space>
 class ParallelScan;
 
 template <class FunctorType, class ExecPolicy, class ReturnType = InvalidType,
-          class ExecutionSapce = typename Impl::FunctorPolicyExecutionSpace<
+          class ExecutionSpace = typename Impl::FunctorPolicyExecutionSpace<
               FunctorType, ExecPolicy>::execution_space>
 class ParallelScanWithTotal;
 
@@ -418,4 +432,8 @@ template <class Index, class Space = HostSpace>
 struct StdPartitionPoint;
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE_FWD
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE_FWD
+#endif
 #endif /* #ifndef KOKKOS_CORE_FWD_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp
index 0657146bb..9c0d1f682 100644
--- a/packages/kokkos/core/src/Kokkos_Crs.hpp
+++ b/packages/kokkos/core/src/Kokkos_Crs.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_CRS_HPP
 #define KOKKOS_CRS_HPP
 
@@ -213,8 +222,7 @@ class CrsRowMapFromCounts {
   KOKKOS_INLINE_FUNCTION
   void init(value_type& update) const { update = 0; }
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& update,
-            const volatile value_type& input) const {
+  void join(value_type& update, const value_type& input) const {
     update += input;
   }
   using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp
index 0063b1cd1..72a00f41b 100644
--- a/packages/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_CUDA_HPP
 #define KOKKOS_CUDA_HPP
 
@@ -62,8 +71,8 @@
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -183,17 +192,16 @@ class Cuda {
   /// return asynchronously, before the functor completes.  This
   /// method does not return until all dispatched functors on this
   /// device have completed.
-  static void impl_static_fence();
-  static void impl_static_fence(const std::string&);
+  static void impl_static_fence(const std::string& name);
 
-  void fence() const;
-  void fence(const std::string&) const;
+  void fence(const std::string& name =
+                 "Kokkos::Cuda::fence(): Unnamed Instance Fence") const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
 
   //! Print configuration information to the given output stream.
-  static void print_configuration(std::ostream&, const bool detail = false);
+  void print_configuration(std::ostream& os, bool verbose = false) const;
 
   //@}
   //--------------------------------------------------
@@ -204,15 +212,6 @@ class Cuda {
   Cuda(cudaStream_t stream, bool manage_stream = false);
 
   //--------------------------------------------------------------------------
-  //! \name Device-specific functions
-  //@{
-
-  struct SelectDevice {
-    int cuda_device_id;
-    SelectDevice() : cuda_device_id(0) {}
-    explicit SelectDevice(int id) : cuda_device_id(id) {}
-  };
-
   //! Free any resources being consumed by the device.
   static void impl_finalize();
 
@@ -220,8 +219,7 @@ class Cuda {
   static int impl_is_initialized();
 
   //! Initialize, telling the CUDA run-time library which device to use.
-  static void impl_initialize(const SelectDevice         = SelectDevice(),
-                              const size_t num_instances = 1);
+  static void impl_initialize(InitializationSettings const&);
 
   /// \brief Cuda device architecture of the selected device.
   ///
@@ -267,17 +265,6 @@ struct DeviceTypeTraits<Cuda> {
 
 namespace Impl {
 
-class CudaSpaceInitializer : public ExecSpaceInitializerBase {
- public:
-  CudaSpaceInitializer()  = default;
-  ~CudaSpaceInitializer() = default;
-  void initialize(const InitArguments& args) final;
-  void finalize(const bool all_spaces) final;
-  void fence() final;
-  void fence(const std::string&) final;
-  void print_configuration(std::ostream& msg, const bool detail) final;
-};
-
 template <class DT, class... DP>
 struct ZeroMemset<Kokkos::Cuda, DT, DP...> {
   ZeroMemset(const Kokkos::Cuda& exec_space_instance,
diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
index 910a8b2d7..7ec78c021 100644
--- a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_CUDASPACE_HPP
 #define KOKKOS_CUDASPACE_HPP
 
@@ -98,6 +107,10 @@ class CudaSpace {
   ~CudaSpace()                               = default;
 
   /**\brief  Allocate untracked memory in the cuda space */
+  void* allocate(const Cuda& exec_space, const size_t arg_alloc_size) const;
+  void* allocate(const Cuda& exec_space, const char* arg_label,
+                 const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
   void* allocate(const size_t arg_alloc_size) const;
   void* allocate(const char* arg_label, const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;
@@ -111,6 +124,11 @@ class CudaSpace {
  private:
   template <class, class, class, class>
   friend class Kokkos::Experimental::LogicalMemorySpace;
+  void* impl_allocate(const Cuda& exec_space, const char* arg_label,
+                      const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
   void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
                       const size_t arg_logical_size = 0,
                       const Kokkos::Tools::SpaceHandle =
@@ -574,11 +592,50 @@ class SharedAllocationRecord<Kokkos::CudaSpace, void>
   ~SharedAllocationRecord();
   SharedAllocationRecord() = default;
 
+  // This constructor does not forward to the one without exec_space arg
+  // in order to work around https://github.com/kokkos/kokkos/issues/5258
+  // This constructor is templated so I can't just put it into the cpp file
+  // like the other constructor.
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/, const Kokkos::CudaSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+            &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record,
+#endif
+            Impl::checked_allocation_with_header(arg_space, arg_label,
+                                                 arg_alloc_size),
+            sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+            arg_label),
+        m_tex_obj(0),
+        m_space(arg_space) {
+
+    SharedAllocationHeader header;
+
+    this->base_t::_fill_host_accessible_header_info(header, arg_label);
+
+    // Copy to device memory
+    // workaround for issue with NVCC and MSVC
+    // https://github.com/kokkos/kokkos/issues/5258
+    deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header);
+  }
+
+  SharedAllocationRecord(
+      const Kokkos::Cuda& exec_space, const Kokkos::CudaSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+
   SharedAllocationRecord(
       const Kokkos::CudaSpace& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size,
       const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 
+  // helper function to work around MSVC+NVCC issue
+  // https://github.com/kokkos/kokkos/issues/5258
+  static void deep_copy_header_no_exec(void*, const void*);
+
  public:
   template <typename AliasType>
   inline ::cudaTextureObject_t attach_texture_object() {
@@ -625,6 +682,30 @@ class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
   ~SharedAllocationRecord();
   SharedAllocationRecord() = default;
 
+  // This constructor does not forward to the one without exec_space arg
+  // in order to work around https://github.com/kokkos/kokkos/issues/5258
+  // This constructor is templated so I can't just put it into the cpp file
+  // like the other constructor.
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label,
+      const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+            &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record,
+#endif
+            Impl::checked_allocation_with_header(arg_space, arg_label,
+                                                 arg_alloc_size),
+            sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+            arg_label),
+        m_tex_obj(0),
+        m_space(arg_space) {
+    this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                    arg_label);
+  }
+
   SharedAllocationRecord(
       const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size,
@@ -676,10 +757,34 @@ class SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>
   ~SharedAllocationRecord();
   SharedAllocationRecord() = default;
 
+  // This constructor does not forward to the one without exec_space arg
+  // in order to work around https://github.com/kokkos/kokkos/issues/5258
+  // This constructor is templated so I can't just put it into the cpp file
+  // like the other constructor.
+  template <typename ExecutionSpace>
   SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
       const Kokkos::CudaHostPinnedSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+            &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
+                                    void>::s_root_record,
+#endif
+            Impl::checked_allocation_with_header(arg_space, arg_label,
+                                                 arg_alloc_size),
+            sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+            arg_label),
+        m_space(arg_space) {
+    this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                    arg_label);
+  }
+
+  SharedAllocationRecord(
+      const Kokkos::CudaHostPinnedSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp
index 9e060b343..d45693819 100644
--- a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp
+++ b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp
@@ -43,6 +43,10 @@
 */
 #ifndef KOKKOS_DETECTION_IDIOM_HPP
 #define KOKKOS_DETECTION_IDIOM_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DETECTIONIDIOM
+#endif
 
 #include <impl/Kokkos_Utilities.hpp>  // void_t
 #include <type_traits>
@@ -113,4 +117,8 @@ inline constexpr bool is_detected_convertible_v =
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DETECTIONIDIOM
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_DETECTIONIDIOM
+#endif
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
index c88c1ada1..4cd57bae1 100644
--- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_EXECPOLICY_HPP
 #define KOKKOS_EXECPOLICY_HPP
 
@@ -199,11 +208,10 @@ class RangePolicy : public Impl::PolicyTraits<Properties...> {
   inline member_type chunk_size() const { return m_granularity; }
 
   /** \brief set chunk_size to a discrete value*/
-  inline RangePolicy set_chunk_size(int chunk_size_) const {
-    RangePolicy p        = *this;
-    p.m_granularity      = chunk_size_;
-    p.m_granularity_mask = p.m_granularity - 1;
-    return p;
+  inline RangePolicy& set_chunk_size(int chunk_size) {
+    m_granularity      = chunk_size;
+    m_granularity_mask = m_granularity - 1;
+    return *this;
   }
 
  private:
@@ -431,53 +439,49 @@ class TeamPolicyInternal : public Impl::PolicyTraits<Properties...> {
 };
 
 struct PerTeamValue {
-  int value;
-  PerTeamValue(int arg);
+  size_t value;
+  PerTeamValue(size_t arg);
 };
 
 struct PerThreadValue {
-  int value;
-  PerThreadValue(int arg);
+  size_t value;
+  PerThreadValue(size_t arg);
 };
 
 template <class iType, class... Args>
 struct ExtractVectorLength {
   static inline iType value(
-      typename std::enable_if<std::is_integral<iType>::value, iType>::type val,
-      Args...) {
+      std::enable_if_t<std::is_integral<iType>::value, iType> val, Args...) {
     return val;
   }
-  static inline
-      typename std::enable_if<!std::is_integral<iType>::value, int>::type
-      value(
-          typename std::enable_if<!std::is_integral<iType>::value, iType>::type,
-          Args...) {
+  static inline std::enable_if_t<!std::is_integral<iType>::value, int> value(
+      std::enable_if_t<!std::is_integral<iType>::value, iType>, Args...) {
     return 1;
   }
 };
 
 template <class iType, class... Args>
-inline typename std::enable_if<std::is_integral<iType>::value, iType>::type
+inline std::enable_if_t<std::is_integral<iType>::value, iType>
 extract_vector_length(iType val, Args...) {
   return val;
 }
 
 template <class iType, class... Args>
-inline typename std::enable_if<!std::is_integral<iType>::value, int>::type
+inline std::enable_if_t<!std::is_integral<iType>::value, int>
 extract_vector_length(iType, Args...) {
   return 1;
 }
 
 }  // namespace Impl
 
-Impl::PerTeamValue PerTeam(const int& arg);
-Impl::PerThreadValue PerThread(const int& arg);
+Impl::PerTeamValue PerTeam(const size_t& arg);
+Impl::PerThreadValue PerThread(const size_t& arg);
 
 struct ScratchRequest {
   int level;
 
-  int per_team;
-  int per_thread;
+  size_t per_team;
+  size_t per_thread;
 
   inline ScratchRequest(const int& level_,
                         const Impl::PerTeamValue& team_value) {
@@ -813,7 +817,7 @@ KOKKOS_INLINE_FUNCTION_DELETED
 template <typename iType1, typename iType2, class TeamMemberType,
           class _never_use_this_overload>
 KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, TeamMemberType>
+    std::common_type_t<iType1, iType2>, TeamMemberType>
 TeamThreadRange(const TeamMemberType&, const iType1& begin,
                 const iType2& end) = delete;
 
@@ -839,7 +843,7 @@ KOKKOS_INLINE_FUNCTION_DELETED
 template <typename iType1, typename iType2, class TeamMemberType,
           class _never_use_this_overload>
 KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, TeamMemberType>
+    std::common_type_t<iType1, iType2>, TeamMemberType>
 TeamVectorRange(const TeamMemberType&, const iType1& begin,
                 const iType2& end) = delete;
 
@@ -858,14 +862,14 @@ KOKKOS_INLINE_FUNCTION_DELETED
 template <typename iType1, typename iType2, class TeamMemberType,
           class _never_use_this_overload>
 KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, TeamMemberType>
+    std::common_type_t<iType1, iType2>, TeamMemberType>
 ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin,
                   const iType2& arg_end) = delete;
 
 namespace Impl {
 
 template <typename FunctorType, typename TagType,
-          bool HasTag = !std::is_same<TagType, void>::value>
+          bool HasTag = !std::is_void<TagType>::value>
 struct ParallelConstructName;
 
 template <typename FunctorType, typename TagType>
diff --git a/packages/kokkos/core/src/Kokkos_Extents.hpp b/packages/kokkos/core/src/Kokkos_Extents.hpp
index 683b76e1f..c51d663ce 100644
--- a/packages/kokkos/core/src/Kokkos_Extents.hpp
+++ b/packages/kokkos/core/src/Kokkos_Extents.hpp
@@ -41,6 +41,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_KOKKOS_EXTENTS_HPP
 #define KOKKOS_KOKKOS_EXTENTS_HPP
 
@@ -98,9 +107,8 @@ struct _parse_impl {
 // We have to treat the case of int**[x] specially, since it *doesn't* go
 // backwards
 template <class T, ptrdiff_t... ExtentSpec>
-struct _parse_impl<
-    T*, Kokkos::Experimental::Extents<ExtentSpec...>,
-    typename std::enable_if<_all_remaining_extents_dynamic<T>::value>::type>
+struct _parse_impl<T*, Kokkos::Experimental::Extents<ExtentSpec...>,
+                   std::enable_if_t<_all_remaining_extents_dynamic<T>::value>>
     : _parse_impl<T, Kokkos::Experimental::Extents<
                          Kokkos::Experimental::dynamic_extent, ExtentSpec...>> {
 };
@@ -109,7 +117,7 @@ struct _parse_impl<
 template <class T, ptrdiff_t... ExtentSpec>
 struct _parse_impl<
     T*, Kokkos::Experimental::Extents<ExtentSpec...>,
-    typename std::enable_if<!_all_remaining_extents_dynamic<T>::value>::type> {
+    std::enable_if_t<!_all_remaining_extents_dynamic<T>::value>> {
   using _next = Kokkos::Experimental::AppendExtent<
       typename _parse_impl<T, Kokkos::Experimental::Extents<ExtentSpec...>,
                            void>::type,
diff --git a/packages/kokkos/core/src/Kokkos_Future.hpp b/packages/kokkos/core/src/Kokkos_Future.hpp
index b163bd1fc..4da6c2b5d 100644
--- a/packages/kokkos/core/src/Kokkos_Future.hpp
+++ b/packages/kokkos/core/src/Kokkos_Future.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_FUTURE_HPP
 #define KOKKOS_FUTURE_HPP
 
@@ -155,13 +164,13 @@ class BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> {
   KOKKOS_INLINE_FUNCTION BasicFuture(
       BasicFuture<T, S>&& rhs) noexcept  // NOLINT(google-explicit-constructor)
       : m_task(std::move(rhs.m_task)) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Moved Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Moved Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Moved Futures must have the same value_type");
 
     // reference counts are unchanged, since this is a move
     rhs.m_task = nullptr;
@@ -172,13 +181,13 @@ class BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> {
       BasicFuture<T, S> const& rhs)  // NOLINT(google-explicit-constructor)
                                      //: m_task(rhs.m_task)
       : m_task(nullptr) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Copied Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Copied Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Copied Futures must have the same value_type");
 
     *static_cast<task_base_type* volatile*>(&m_task) = rhs.m_task;
     if (m_task) m_task->increment_reference_count();
@@ -186,13 +195,13 @@ class BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> {
 
   template <class T, class S>
   KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S> const& rhs) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Assigned Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Assigned Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Assigned Futures must have the same value_type");
 
     if (m_task != rhs.m_task) {
       clear();
@@ -207,13 +216,13 @@ class BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> {
 
   template <class T, class S>
   KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S>&& rhs) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Assigned Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Assigned Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Assigned Futures must have the same value_type");
 
     if (m_task != rhs.m_task) {
       clear();
@@ -361,13 +370,13 @@ class BasicFuture {
   KOKKOS_INLINE_FUNCTION BasicFuture(
       BasicFuture<T, S>&& rhs) noexcept  // NOLINT(google-explicit-constructor)
       : m_task(rhs.m_task) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Assigned Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Assigned Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Assigned Futures must have the same value_type");
 
     rhs.m_task = 0;
   }
@@ -376,26 +385,26 @@ class BasicFuture {
   KOKKOS_INLINE_FUNCTION BasicFuture(
       BasicFuture<T, S> const& rhs)  // NOLINT(google-explicit-constructor)
       : m_task(nullptr) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Assigned Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Assigned Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Assigned Futures must have the same value_type");
 
     if (rhs.m_task) queue_type::assign(&m_task, rhs.m_task);
   }
 
   template <class T, class S>
   KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S> const& rhs) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Assigned Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Assigned Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Assigned Futures must have the same value_type");
 
     if (m_task || rhs.m_task) queue_type::assign(&m_task, rhs.m_task);
     return *this;
@@ -403,13 +412,13 @@ class BasicFuture {
 
   template <class T, class S>
   KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S>&& rhs) {
-    static_assert(std::is_same<scheduler_type, void>::value ||
+    static_assert(std::is_void<scheduler_type>::value ||
                       std::is_same<scheduler_type, S>::value,
                   "Assigned Futures must have the same scheduler");
 
-    static_assert(std::is_same<value_type, void>::value ||
-                      std::is_same<value_type, T>::value,
-                  "Assigned Futures must have the same value_type");
+    static_assert(
+        std::is_void<value_type>::value || std::is_same<value_type, T>::value,
+        "Assigned Futures must have the same value_type");
 
     clear();
     m_task     = rhs.m_task;
@@ -422,7 +431,7 @@ class BasicFuture {
   KOKKOS_INLINE_FUNCTION
   int is_ready() const noexcept {
     return (nullptr == m_task) ||
-           (((task_base*)task_base::LockTag) == m_task->m_wait);
+           (reinterpret_cast<task_base*>(task_base::LockTag) == m_task->m_wait);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -456,8 +465,8 @@ class ResolveFutureArgOrder {
  private:
   enum { Arg1_is_space = Kokkos::is_space<Arg1>::value };
   enum { Arg2_is_space = Kokkos::is_space<Arg2>::value };
-  enum { Arg1_is_value = !Arg1_is_space && !std::is_same<Arg1, void>::value };
-  enum { Arg2_is_value = !Arg2_is_space && !std::is_same<Arg2, void>::value };
+  enum { Arg1_is_value = !Arg1_is_space && !std::is_void<Arg1>::value };
+  enum { Arg2_is_value = !Arg2_is_space && !std::is_void<Arg2>::value };
 
   static_assert(!(Arg1_is_space && Arg2_is_space),
                 "Future cannot be given two spaces");
@@ -465,14 +474,13 @@ class ResolveFutureArgOrder {
   static_assert(!(Arg1_is_value && Arg2_is_value),
                 "Future cannot be given two value types");
 
-  using value_type = typename std::conditional<
-      Arg1_is_value, Arg1,
-      typename std::conditional<Arg2_is_value, Arg2, void>::type>::type;
+  using value_type =
+      std::conditional_t<Arg1_is_value, Arg1,
+                         std::conditional_t<Arg2_is_value, Arg2, void>>;
 
-  using execution_space = typename std::conditional<
+  using execution_space = typename std::conditional_t<
       Arg1_is_space, Arg1,
-      typename std::conditional<Arg2_is_space, Arg2,
-                                void>::type>::type::execution_space;
+      std::conditional_t<Arg2_is_space, Arg2, void>>::execution_space;
 
  public:
   using type = BasicFuture<value_type, TaskScheduler<execution_space>>;
diff --git a/packages/kokkos/core/src/Kokkos_Graph.hpp b/packages/kokkos/core/src/Kokkos_Graph.hpp
index ef6057ae8..1f71665fb 100644
--- a/packages/kokkos/core/src/Kokkos_Graph.hpp
+++ b/packages/kokkos/core/src/Kokkos_Graph.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_GRAPH_HPP
 #define KOKKOS_GRAPH_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <impl/Kokkos_Error.hpp>  // KOKKOS_EXPECTS
@@ -188,4 +192,8 @@ Graph<ExecutionSpace> create_graph(Closure&& arg_closure) {
 #include <impl/Kokkos_GraphNodeImpl.hpp>
 #include <impl/Kokkos_Default_Graph_Impl.hpp>
 #include <Cuda/Kokkos_Cuda_Graph_Impl.hpp>
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH
+#endif
 #endif  // KOKKOS_GRAPH_HPP
diff --git a/packages/kokkos/core/src/Kokkos_GraphNode.hpp b/packages/kokkos/core/src/Kokkos_GraphNode.hpp
index e34d1353e..6eab5ec8c 100644
--- a/packages/kokkos/core/src/Kokkos_GraphNode.hpp
+++ b/packages/kokkos/core/src/Kokkos_GraphNode.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_KOKKOS_GRAPHNODE_HPP
 #define KOKKOS_KOKKOS_GRAPHNODE_HPP
 
@@ -225,7 +234,7 @@ class GraphNodeRef {
 
   template <
       class OtherKernel, class OtherPredecessor,
-      typename std::enable_if_t<
+      std::enable_if_t<
           // Not a copy/move constructor
           !std::is_same<GraphNodeRef, GraphNodeRef<execution_space, OtherKernel,
                                                    OtherPredecessor>>::value &&
@@ -256,12 +265,12 @@ class GraphNodeRef {
 
   template <
       class Policy, class Functor,
-      typename std::enable_if<
+      std::enable_if_t<
           // equivalent to:
           //   requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>>
           is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value,
           // --------------------
-          int>::type = 0>
+          int> = 0>
   auto then_parallel_for(std::string arg_name, Policy&& arg_policy,
                          Functor&& functor) const {
     //----------------------------------------
@@ -298,12 +307,12 @@ class GraphNodeRef {
 
   template <
       class Policy, class Functor,
-      typename std::enable_if<
+      std::enable_if_t<
           // equivalent to:
           //   requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>>
           is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value,
           // --------------------
-          int>::type = 0>
+          int> = 0>
   auto then_parallel_for(Policy&& policy, Functor&& functor) const {
     // needs to static assert constraint: DataParallelFunctor<Functor>
     return this->then_parallel_for("", (Policy &&) policy,
@@ -333,12 +342,12 @@ class GraphNodeRef {
 
   template <
       class Policy, class Functor, class ReturnType,
-      typename std::enable_if<
+      std::enable_if_t<
           // equivalent to:
           //   requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>>
           is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value,
           // --------------------
-          int>::type = 0>
+          int> = 0>
   auto then_parallel_reduce(std::string arg_name, Policy&& arg_policy,
                             Functor&& functor,
                             ReturnType&& return_value) const {
@@ -353,8 +362,7 @@ class GraphNodeRef {
     // needs static assertion of constraint:
     //   DataParallelReductionFunctor<Functor, ReturnType>
 
-    using policy_t = typename std::remove_cv<
-        typename std::remove_reference<Policy>::type>::type;
+    using policy_t = std::remove_cv_t<std::remove_reference_t<Policy>>;
     static_assert(
         std::is_same<typename policy_t::execution_space,
                      execution_space>::value,
@@ -380,8 +388,8 @@ class GraphNodeRef {
 
     //----------------------------------------
     // This is a disaster, but I guess it's not a my disaster to fix right now
-    using return_type_remove_cvref = typename std::remove_cv<
-        typename std::remove_reference<ReturnType>::type>::type;
+    using return_type_remove_cvref =
+        std::remove_cv_t<std::remove_reference_t<ReturnType>>;
     static_assert(Kokkos::is_view<return_type_remove_cvref>::value ||
                       Kokkos::is_reducer<return_type_remove_cvref>::value,
                   "Output argument to parallel reduce in a graph must be a "
@@ -416,12 +424,12 @@ class GraphNodeRef {
 
   template <
       class Policy, class Functor, class ReturnType,
-      typename std::enable_if<
+      std::enable_if_t<
           // equivalent to:
           //   requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>>
           is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value,
           // --------------------
-          int>::type = 0>
+          int> = 0>
   auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor,
                             ReturnType&& return_value) const {
     return this->then_parallel_reduce("", (Policy &&) arg_policy,
diff --git a/packages/kokkos/core/src/Kokkos_Graph_fwd.hpp b/packages/kokkos/core/src/Kokkos_Graph_fwd.hpp
index 1ba58e4c8..6f639658e 100644
--- a/packages/kokkos/core/src/Kokkos_Graph_fwd.hpp
+++ b/packages/kokkos/core/src/Kokkos_Graph_fwd.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_KOKKOS_GRAPH_FWD_HPP
 #define KOKKOS_KOKKOS_GRAPH_FWD_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH_FWD
+#endif
 
 #include <Kokkos_Macros.hpp>
 
@@ -62,4 +66,8 @@ class GraphNodeRef;
 }  // end namespace Experimental
 }  // end namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH_FWD
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH_FWD
+#endif
 #endif  // KOKKOS_KOKKOS_GRAPH_FWD_HPP
diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
index 47810f17a..0c5dbbdc2 100644
--- a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_HBWSPACE_HPP
 #define KOKKOS_HBWSPACE_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_HIP.hpp b/packages/kokkos/core/src/Kokkos_HIP.hpp
index 7241bb6c3..c387b5945 100644
--- a/packages/kokkos/core/src/Kokkos_HIP.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_HIP_HPP
 #define KOKKOS_HIP_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
index 68869a607..8c195a0f3 100644
--- a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_HIPSPACE_HPP
 #define KOKKOS_HIPSPACE_HPP
 
@@ -61,8 +70,8 @@
 #include <HIP/Kokkos_HIP_Error.hpp>  // HIP_SAFE_CALL
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 
 #include <hip/hip_runtime_api.h>
 /*--------------------------------------------------------------------------*/
@@ -213,6 +222,75 @@ struct Impl::is_hip_type_space<Experimental::HIPHostPinnedSpace>
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
+namespace Kokkos {
+namespace Experimental {
+/** \brief  Memory that is accessible to HIP execution space
+ *          and host through HIP's memory page migration.
+ */
+class HIPManagedSpace {
+ public:
+  //! Tag this class as a kokkos memory space
+  /** \brief  Memory is unified to both device and host via page migration
+   *  and therefore able to be used by HostSpace::execution_space and
+   *  DeviceSpace::execution_space.
+   */
+  //! tag this class as a kokkos memory space
+  using memory_space    = HIPManagedSpace;
+  using execution_space = Kokkos::Experimental::HIP;
+  using device_type     = Kokkos::Device<execution_space, memory_space>;
+  using size_type       = unsigned int;
+
+  /*--------------------------------*/
+
+  HIPManagedSpace();
+  HIPManagedSpace(HIPManagedSpace&& rhs)      = default;
+  HIPManagedSpace(const HIPManagedSpace& rhs) = default;
+  HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default;
+  HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default;
+  ~HIPManagedSpace()                                     = default;
+
+  /**\brief  Allocate untracked memory in the space */
+  void* allocate(const size_t arg_alloc_size) const;
+  void* allocate(const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
+  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                  const size_t arg_alloc_size,
+                  const size_t arg_logical_size = 0) const;
+
+ private:
+  int m_device;  ///< Which HIP device
+  template <class, class, class, class>
+  friend class LogicalMemorySpace;
+  void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
+  void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                       const size_t arg_alloc_size,
+                       const size_t arg_logical_size = 0,
+                       const Kokkos::Tools::SpaceHandle =
+                           Kokkos::Tools::make_space_handle(name())) const;
+
+ public:
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return "HIPManaged"; }
+
+  /*--------------------------------*/
+};
+}  // namespace Experimental
+
+template <>
+struct Impl::is_hip_type_space<Experimental::HIPManagedSpace>
+    : public std::true_type {};
+
+}  // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
 namespace Kokkos {
 namespace Impl {
 
@@ -239,6 +317,15 @@ struct MemorySpaceAccess<Kokkos::HostSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::HIPManagedSpace> {
+  // HostSpace::execution_space != HIPManagedSpace::execution_space
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
 //----------------------------------------
 
 template <>
@@ -257,6 +344,15 @@ struct MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
+                         Kokkos::Experimental::HIPManagedSpace> {
+  // HIPSpace::execution_space == HIPManagedSpace::execution_space
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
 //----------------------------------------
 // HIPHostPinnedSpace::execution_space == HostSpace::execution_space
 // HIPHostPinnedSpace accessible to both HIP and Host
@@ -277,6 +373,42 @@ struct MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace,
+                         Kokkos::Experimental::HIPManagedSpace> {
+  enum : bool { assignable = false };  // different exec_space
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+// HIPManagedSpace::execution_space != HostSpace::execution_space
+// HIPManagedSpace accessible to both HIP and Host
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::HIPManagedSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };  // HIPHostPinnedSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::HIPManagedSpace,
+                         Kokkos::Experimental::HIPSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::HIPManagedSpace,
+                         Kokkos::Experimental::HIPHostPinnedSpace> {
+  enum : bool { assignable = false };  // different exec_space
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
 };  // namespace Impl
 //----------------------------------------
 
@@ -433,6 +565,21 @@ class SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>
  protected:
   ~SharedAllocationRecord();
 
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec*/,
+      const Kokkos::Experimental::HIPSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::HIP& exec_space,
+      const Kokkos::Experimental::HIPSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+
   SharedAllocationRecord(
       const Kokkos::Experimental::HIPSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
@@ -463,11 +610,59 @@ class SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>
   ~SharedAllocationRecord();
   SharedAllocationRecord() = default;
 
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
   SharedAllocationRecord(
       const Kokkos::Experimental::HIPHostPinnedSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
       const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
+
+template <>
+class SharedAllocationRecord<Kokkos::Experimental::HIPManagedSpace, void>
+    : public SharedAllocationRecordCommon<
+          Kokkos::Experimental::HIPManagedSpace> {
+ private:
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::HIPManagedSpace>;
+  using base_t =
+      SharedAllocationRecordCommon<Kokkos::Experimental::HIPManagedSpace>;
+  using RecordBase = SharedAllocationRecord<void, void>;
+
+  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
+
+#ifdef KOKKOS_ENABLE_DEBUG
+  static RecordBase s_root_record;
+#endif
+
+  const Kokkos::Experimental::HIPManagedSpace m_space;
+
+ protected:
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default;
+
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::Experimental::HIPManagedSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::HIPManagedSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+};
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -519,16 +714,15 @@ class HIP {
    * asynchronously, before the functor completes. This method does not return
    * until all dispatched functors on this device have completed.
    */
-  static void impl_static_fence();
-  static void impl_static_fence(const std::string&);
+  static void impl_static_fence(const std::string& name);
 
-  void fence() const;
-  void fence(const std::string&) const;
+  void fence(const std::string& name =
+                 "Kokkos::HIP::fence(): Unnamed Instance Fence") const;
 
   hipStream_t hip_stream() const;
 
   /// \brief Print configuration information to the given output stream.
-  static void print_configuration(std::ostream&, const bool detail = false);
+  void print_configuration(std::ostream& os, bool verbose = false) const;
 
   /// \brief Free any resources being consumed by the device.
   static void impl_finalize();
@@ -536,16 +730,10 @@ class HIP {
   /** \brief  Initialize the device.
    *
    */
-  struct SelectDevice {
-    int hip_device_id;
-    SelectDevice() : hip_device_id(0) {}
-    explicit SelectDevice(int id) : hip_device_id(id) {}
-  };
-
   int hip_device() const;
   static hipDeviceProp_t const& hip_device_prop();
 
-  static void impl_initialize(const SelectDevice = SelectDevice());
+  static void impl_initialize(InitializationSettings const&);
 
   static int impl_is_initialized();
 
@@ -579,18 +767,6 @@ struct DeviceTypeTraits<Kokkos::Experimental::HIP> {
 }  // namespace Tools
 
 namespace Impl {
-
-class HIPSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
- public:
-  HIPSpaceInitializer()  = default;
-  ~HIPSpaceInitializer() = default;
-  void initialize(const InitArguments& args) final;
-  void finalize(const bool) final;
-  void fence() final;
-  void fence(const std::string&) final;
-  void print_configuration(std::ostream& msg, const bool detail) final;
-};
-
 template <class DT, class... DP>
 struct ZeroMemset<Kokkos::Experimental::HIP, DT, DP...> {
   ZeroMemset(const Kokkos::Experimental::HIP& exec_space,
diff --git a/packages/kokkos/core/src/Kokkos_HPX.hpp b/packages/kokkos/core/src/Kokkos_HPX.hpp
index 9238ca30a..044e54fb2 100644
--- a/packages/kokkos/core/src/Kokkos_HPX.hpp
+++ b/packages/kokkos/core/src/Kokkos_HPX.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_HPX_HPP
 #define KOKKOS_HPX_HPP
 
@@ -65,11 +74,10 @@
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <impl/Kokkos_ConcurrentBitset.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_Tools.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 
 #include <KokkosExp_MDRangePolicy.hpp>
 
@@ -89,7 +97,6 @@
 #include <iostream>
 #include <memory>
 #include <sstream>
-#include <stdexcept>
 #include <type_traits>
 #include <vector>
 
@@ -210,8 +217,6 @@ class HPX {
   enum class instance_mode { default_, independent };
 
  private:
-  instance_mode m_mode;
-
   static uint32_t m_active_parallel_region_count;
   static hpx::spinlock m_active_parallel_region_count_mutex;
   static hpx::condition_variable_any m_active_parallel_region_count_cond;
@@ -246,7 +251,6 @@ class HPX {
   HPX()
   noexcept
       : m_instance_id(impl_default_instance_id()),
-        m_mode(instance_mode::default_),
         m_buffer(m_default_instance_data.m_buffer),
         m_future(m_default_instance_data.m_future),
         m_future_mutex(m_default_instance_data.m_future_mutex) {}
@@ -255,7 +259,6 @@ class HPX {
       : m_instance_id(mode == instance_mode::independent
                           ? m_next_instance_id++
                           : impl_default_instance_id()),
-        m_mode(mode),
         m_independent_instance_data(mode == instance_mode::independent
                                         ? (new instance_data())
                                         : nullptr),
@@ -271,7 +274,6 @@ class HPX {
 
   HPX(hpx::shared_future<void> future)
       : m_instance_id(m_next_instance_id++),
-        m_mode(instance_mode::independent),
 
         m_independent_instance_data(new instance_data(future)),
         m_buffer(m_independent_instance_data->m_buffer),
@@ -286,9 +288,11 @@ class HPX {
   HPX() noexcept {}
 #endif
 
-  static void print_configuration(std::ostream &,
-                                  const bool /* verbose */ = false) {
-    std::cout << "HPX backend" << std::endl;
+  void print_configuration(std::ostream &os, bool /*verbose*/ = false) const {
+    os << "HPX backend\n";
+    os << "HPX Execution Space:\n";
+    os << "  KOKKOS_ENABLE_HPX: yes\n";
+    os << "\nHPX Runtime Configuration:\n";
   }
   uint32_t impl_instance_id() const noexcept { return m_instance_id; }
 
@@ -315,9 +319,9 @@ class HPX {
   }
 #endif
 
-  void impl_fence_instance(const std::string &name =
-                               "Kokkos::Experimental::HPX::impl_fence_instance:"
-                               " Unnamed Instance Fence") const {
+  void fence(
+      const std::string &name =
+          "Kokkos::Experimental::HPX::fence: Unnamed Instance Fence") const {
     Kokkos::Tools::Experimental::Impl::profile_fence_event<
         Kokkos::Experimental::HPX>(
         name,
@@ -333,9 +337,7 @@ class HPX {
         });
   }
 
-  static void impl_fence_global(const std::string &name =
-                                    "Kokkos::Experimental::HPX::impl_fence_"
-                                    "global: Unnamed Global Fence") {
+  static void impl_static_fence(const std::string &name) {
     Kokkos::Tools::Experimental::Impl::profile_fence_event<
         Kokkos::Experimental::HPX>(
         name,
@@ -350,7 +352,7 @@ class HPX {
           // Reset the future to free variables that may have been captured in
           // parallel regions (however, we don't have access to futures from
           // instances other than the default instances, they will only be
-          // released by impl_fence_instance).
+          // released by fence).
           HPX().impl_get_future() = hpx::make_ready_future<void>();
 #endif
         });
@@ -360,9 +362,6 @@ class HPX {
     return hpx::execution::parallel_executor();
   }
 
-  void fence() const { impl_fence_instance(); }
-  void fence(const std::string &name) const { impl_fence_instance(name); }
-
   static bool is_asynchronous(HPX const & = HPX()) noexcept {
 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
     return true;
@@ -371,6 +370,7 @@ class HPX {
 #endif
   }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   static std::vector<HPX> partition(...) {
     Kokkos::abort(
         "Kokkos::Experimental::HPX::partition_master: can't partition an HPX "
@@ -378,7 +378,6 @@ class HPX {
     return std::vector<HPX>();
   }
 
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   template <typename F>
   KOKKOS_DEPRECATED static void partition_master(
       F const &, int requested_num_partitions = 0, int = 0) {
@@ -391,8 +390,7 @@ class HPX {
 #endif
 
   static int concurrency();
-  static void impl_initialize(int thread_count);
-  static void impl_initialize();
+  static void impl_initialize(InitializationSettings const &);
   static bool impl_is_initialized() noexcept;
   static void impl_finalize();
 
@@ -507,17 +505,6 @@ struct DeviceTypeTraits<Kokkos::Experimental::HPX> {
 
 namespace Impl {
 
-class HPXSpaceInitializer : public ExecSpaceInitializerBase {
- public:
-  HPXSpaceInitializer()  = default;
-  ~HPXSpaceInitializer() = default;
-  void initialize(const InitArguments &args) final;
-  void finalize(const bool) final;
-  void fence() final;
-  void fence(const std::string &) final;
-  void print_configuration(std::ostream &msg, const bool detail) final;
-};
-
 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
 template <typename Closure>
 inline void dispatch_execute_task(Closure *closure,
@@ -706,7 +693,7 @@ struct HPXTeamMember {
       const TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
           &policy,
       const int team_rank, const int league_rank, void *scratch,
-      int scratch_size) noexcept
+      size_t scratch_size) noexcept
       : m_team_shared(scratch, scratch_size, scratch, scratch_size),
         m_league_size(policy.league_size()),
         m_league_rank(league_rank),
@@ -733,9 +720,8 @@ struct HPXTeamMember {
   }
 
   template <class ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(const ReducerType &) const {}
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(const ReducerType &) const {}
 
   template <typename Type>
   KOKKOS_INLINE_FUNCTION Type
@@ -860,7 +846,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
   inline int team_size() const { return m_team_size; }
   inline int league_size() const { return m_league_size; }
 
-  inline size_t scratch_size(const int &level, int team_size_ = -1) const {
+  size_t scratch_size(const int &level, int team_size_ = -1) const {
     if (team_size_ < 0) {
       team_size_ = m_team_size;
     }
@@ -1026,31 +1012,29 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
   template <class TagType>
-  static typename std::enable_if<std::is_same<TagType, void>::value>::type
-  execute_functor(const FunctorType &functor, const Member i) {
+  static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Member i) {
     functor(i);
   }
 
   template <class TagType>
-  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  execute_functor(const FunctorType &functor, const Member i) {
+  static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Member i) {
     const TagType t{};
     functor(t, i);
   }
 
   template <class TagType>
-  static typename std::enable_if<std::is_same<TagType, void>::value>::type
-  execute_functor_range(const FunctorType &functor, const Member i_begin,
-                        const Member i_end) {
+  static std::enable_if_t<std::is_void<TagType>::value> execute_functor_range(
+      const FunctorType &functor, const Member i_begin, const Member i_end) {
     for (Member i = i_begin; i < i_end; ++i) {
       functor(i);
     }
   }
 
   template <class TagType>
-  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  execute_functor_range(const FunctorType &functor, const Member i_begin,
-                        const Member i_end) {
+  static std::enable_if_t<!std::is_void<TagType>::value> execute_functor_range(
+      const FunctorType &functor, const Member i_begin, const Member i_end) {
     const TagType t{};
     for (Member i = i_begin; i < i_end; ++i) {
       functor(t, i);
@@ -1156,7 +1140,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy)
       : m_functor(arg_functor),
         m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
+        m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)) {}
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy &, const Functor &) {
     /**
@@ -1180,20 +1164,13 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using WorkTag   = typename Policy::work_tag;
   using WorkRange = typename Policy::WorkRange;
   using Member    = typename Policy::member_type;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
   using ReducerConditional =
       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-  using ValueInit  = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin  = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-  using ValueOps   = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
-  using value_type = typename Analysis::value_type;
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
+  using value_type     = typename Analysis::value_type;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
@@ -1205,35 +1182,29 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   bool m_force_synchronous;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_functor(const FunctorType &functor, const Member i,
-                      reference_type update) {
+  inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Member i, reference_type update) {
     functor(i, update);
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_functor(const FunctorType &functor, const Member i,
-                      reference_type update) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Member i, reference_type update) {
     const TagType t{};
     functor(t, i, update);
   }
 
   template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
-  execute_functor_range(reference_type update, const Member i_begin,
-                        const Member i_end) const {
+  inline std::enable_if_t<std::is_void<TagType>::value> execute_functor_range(
+      reference_type update, const Member i_begin, const Member i_end) const {
     for (Member i = i_begin; i < i_end; ++i) {
       m_functor(i, update);
     }
   }
 
   template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  execute_functor_range(reference_type update, const Member i_begin,
-                        const Member i_end) const {
+  inline std::enable_if_t<!std::is_void<TagType>::value> execute_functor_range(
+      reference_type update, const Member i_begin, const Member i_end) const {
     const TagType t{};
 
     for (Member i = i_begin; i < i_end; ++i) {
@@ -1307,7 +1278,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     }
 
     reference_type reference() const {
-      return ValueOps::reference(
+      return Analysis::Reducer::reference(
           reinterpret_cast<pointer_type>(m_value_buffer));
     }
   };
@@ -1316,10 +1287,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   void execute() const {
     if (m_policy.end() <= m_policy.begin()) {
       if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-        ValueFinal::final(ReducerConditional::select(m_functor, m_reducer),
-                          m_result_ptr);
+        typename Analysis::Reducer final_reducer(
+            &ReducerConditional::select(m_functor, m_reducer));
+
+        final_reducer.init(m_result_ptr);
+        final_reducer.final(m_result_ptr);
       }
       return;
     }
@@ -1331,6 +1303,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
         m_policy.space());
 
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
     const std::size_t value_size =
         Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
 
@@ -1350,20 +1325,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     value_type_wrapper final_value(value_size);
     value_type_wrapper identity(value_size);
 
-    ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                    final_value.pointer());
-    ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                    identity.pointer());
+    final_reducer.init(final_value.pointer());
+    final_reducer.init(identity.pointer());
 
     for_loop(par.on(exec).with(
                  static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
              m_policy.begin(), m_policy.end(),
              reduction(final_value, identity,
-                       [this](value_type_wrapper &a,
-                              value_type_wrapper &b) -> value_type_wrapper & {
-                         ValueJoin::join(
-                             ReducerConditional::select(m_functor, m_reducer),
-                             a.pointer(), b.pointer());
+                       [final_reducer](
+                           value_type_wrapper &a,
+                           value_type_wrapper &b) -> value_type_wrapper & {
+                         final_reducer.join(a.pointer(), b.pointer());
                          return a;
                        }),
              [this](Member i, value_type_wrapper &update) {
@@ -1382,9 +1354,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
     for_loop(
         par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-        [ this, &buffer ](const int t) noexcept {
-          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          reinterpret_cast<pointer_type>(buffer.get(t)));
+        [&buffer, final_reducer ](const int t) noexcept {
+          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
         });
 
     const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
@@ -1392,25 +1363,23 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     for_loop_strided(
         par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
         [this, &buffer, chunk_size](const Member i_begin) {
-          reference_type update =
-              ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
+          reference_type update = Analysis::Reducer::reference(
+              reinterpret_cast<pointer_type>(buffer.get(
                   Kokkos::Experimental::HPX::impl_hardware_thread_id())));
           const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
           execute_functor_range<WorkTag>(update, i_begin, i_end);
         });
 
     for (int i = 1; i < num_worker_threads; ++i) {
-      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
-                      reinterpret_cast<pointer_type>(buffer.get(0)),
-                      reinterpret_cast<pointer_type>(buffer.get(i)));
+      final_reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
+                         reinterpret_cast<pointer_type>(buffer.get(i)));
     }
 
     pointer_type final_value_ptr =
         reinterpret_cast<pointer_type>(buffer.get(0));
 #endif
 
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), final_value_ptr);
+    final_reducer.final(final_value_ptr);
 
     if (m_result_ptr != nullptr) {
       const int n = Analysis::value_count(
@@ -1426,9 +1395,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   inline ParallelReduce(
       const FunctorType &arg_functor, Policy arg_policy,
       const ViewType &arg_view,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void *>::type = nullptr)
+      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                           !Kokkos::is_reducer<ReducerType>::value,
+                       void *> = nullptr)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
@@ -1453,19 +1422,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using WorkTag       = typename MDRangePolicy::work_tag;
   using WorkRange     = typename Policy::WorkRange;
   using Member        = typename Policy::member_type;
-  using Analysis      = FunctorAnalysis<FunctorPatternInterface::REDUCE,
-                                   MDRangePolicy, FunctorType>;
   using ReducerConditional =
       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-  using ValueInit  = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin  = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-  using ValueOps   = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
+  using Analysis       = FunctorAnalysis<FunctorPatternInterface::REDUCE,
+                                   MDRangePolicy, ReducerTypeFwd>;
+
   using pointer_type   = typename Analysis::pointer_type;
   using value_type     = typename Analysis::value_type;
   using reference_type = typename Analysis::reference_type;
@@ -1504,18 +1467,21 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
     auto exec = Kokkos::Experimental::HPX::impl_get_executor();
 
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
 #if KOKKOS_HPX_IMPLEMENTATION == 0
 
-    for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-             [this, &buffer](std::size_t t) {
-               ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                               reinterpret_cast<pointer_type>(buffer.get(t)));
-             });
+    for_loop(
+        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
+        [&buffer, final_reducer](std::size_t t) {
+          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+        });
 
     for_loop(par.on(exec).with(
                  static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
              m_policy.begin(), m_policy.end(), [this, &buffer](const Member i) {
-               reference_type update = ValueOps::reference(
+               reference_type update = Analysis::Reducer::reference(
                    reinterpret_cast<pointer_type>(buffer.get(
                        Kokkos::Experimental::HPX::impl_hardware_thread_id())));
                iterate_type(m_mdr_policy, m_functor, update)(i);
@@ -1524,19 +1490,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 #elif KOKKOS_HPX_IMPLEMENTATION == 1
     using hpx::for_loop_strided;
 
-    for_loop(par.on(exec).with(static_chunk_size(1)), std::size_t(0),
-             num_worker_threads, [this, &buffer](const std::size_t t) {
-               ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                               reinterpret_cast<pointer_type>(buffer.get(t)));
-             });
+    for_loop(
+        par.on(exec).with(static_chunk_size(1)), std::size_t(0),
+        num_worker_threads, [&buffer, final_reducer](const std::size_t t) {
+          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+        });
 
     const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
 
     for_loop_strided(
         par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
         [this, &buffer, chunk_size](const Member i_begin) {
-          reference_type update =
-              ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
+          reference_type update = Analysis::Reducer::reference(
+              reinterpret_cast<pointer_type>(buffer.get(
                   Kokkos::Experimental::HPX::impl_hardware_thread_id())));
           const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
 
@@ -1547,14 +1513,11 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 #endif
 
     for (int i = 1; i < num_worker_threads; ++i) {
-      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
-                      reinterpret_cast<pointer_type>(buffer.get(0)),
-                      reinterpret_cast<pointer_type>(buffer.get(i)));
+      final_reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
+                         reinterpret_cast<pointer_type>(buffer.get(i)));
     }
 
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer),
-        reinterpret_cast<pointer_type>(buffer.get(0)));
+    final_reducer.final(reinterpret_cast<pointer_type>(buffer.get(0)));
 
     if (m_result_ptr != nullptr) {
       const int n = Analysis::value_count(
@@ -1570,12 +1533,12 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   inline ParallelReduce(
       const FunctorType &arg_functor, MDRangePolicy arg_policy,
       const ViewType &arg_view,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void *>::type = nullptr)
+      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                           !Kokkos::is_reducer<ReducerType>::value,
+                       void *> = nullptr)
       : m_functor(arg_functor),
         m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
+        m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)),
         m_reducer(InvalidType()),
         m_result_ptr(arg_view.data()),
         m_force_synchronous(!arg_view.impl_track().has_record()) {}
@@ -1614,9 +1577,6 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   using Member    = typename Policy::member_type;
   using Analysis =
       FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-  using ValueInit      = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueJoin      = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
-  using ValueOps       = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
   using value_type     = typename Analysis::value_type;
@@ -1625,22 +1585,20 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Member i_begin,
-                            const Member i_end, reference_type update,
-                            const bool final) {
+  inline static std::enable_if_t<std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Member i_begin,
+                        const Member i_end, reference_type update,
+                        const bool final) {
     for (Member i = i_begin; i < i_end; ++i) {
       functor(i, update, final);
     }
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Member i_begin,
-                            const Member i_end, reference_type update,
-                            const bool final) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Member i_begin,
+                        const Member i_end, reference_type update,
+                        const bool final) {
     const TagType t{};
     for (Member i = i_begin; i < i_end; ++i) {
       functor(t, i, update, final);
@@ -1670,46 +1628,49 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     barrier<> bar(num_worker_threads);
     auto exec = Kokkos::Experimental::HPX::impl_get_executor();
 
-    for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-             [this, &bar, &buffer, num_worker_threads, value_count,
-              value_size](int t) {
-               reference_type update_sum = ValueInit::init(
-                   m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
-
-               const WorkRange range(m_policy, t, num_worker_threads);
-               execute_functor_range<WorkTag>(m_functor, range.begin(),
-                                              range.end(), update_sum, false);
-
-               bar.arrive_and_wait();
-
-               if (t == 0) {
-                 ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
-                                                buffer.get(0) + value_size));
-
-                 for (int i = 1; i < num_worker_threads; ++i) {
-                   pointer_type ptr_1_prev =
-                       reinterpret_cast<pointer_type>(buffer.get(i - 1));
-                   pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
-                       buffer.get(i - 1) + value_size);
-                   pointer_type ptr_2 = reinterpret_cast<pointer_type>(
-                       buffer.get(i) + value_size);
+    typename Analysis::Reducer final_reducer(&m_functor);
 
-                   for (int j = 0; j < value_count; ++j) {
-                     ptr_2[j] = ptr_2_prev[j];
-                   }
-
-                   ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
-                 }
-               }
+    for_loop(
+        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
+        [this, &bar, &buffer, num_worker_threads, value_count, value_size,
+         final_reducer](int t) {
+          reference_type update_sum =
+              final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+
+          const WorkRange range(m_policy, t, num_worker_threads);
+          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
+                                         update_sum, false);
+
+          bar.arrive_and_wait();
+
+          if (t == 0) {
+            final_reducer.init(
+                reinterpret_cast<pointer_type>(buffer.get(0) + value_size));
+
+            for (int i = 1; i < num_worker_threads; ++i) {
+              pointer_type ptr_1_prev =
+                  reinterpret_cast<pointer_type>(buffer.get(i - 1));
+              pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
+                  buffer.get(i - 1) + value_size);
+              pointer_type ptr_2 =
+                  reinterpret_cast<pointer_type>(buffer.get(i) + value_size);
+
+              for (int j = 0; j < value_count; ++j) {
+                ptr_2[j] = ptr_2_prev[j];
+              }
+
+              final_reducer.join(ptr_2, ptr_1_prev);
+            }
+          }
 
-               bar.arrive_and_wait();
+          bar.arrive_and_wait();
 
-               reference_type update_base = ValueOps::reference(
-                   reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
+          reference_type update_base = Analysis::Reducer::reference(
+              reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
 
-               execute_functor_range<WorkTag>(m_functor, range.begin(),
-                                              range.end(), update_base, true);
-             });
+          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
+                                         update_base, true);
+        });
   }
 
   inline ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
@@ -1726,9 +1687,6 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   using Member    = typename Policy::member_type;
   using Analysis =
       FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-  using ValueInit      = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueJoin      = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
-  using ValueOps       = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
   using value_type     = typename Analysis::value_type;
@@ -1738,22 +1696,20 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   ReturnType &m_returnvalue;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Member i_begin,
-                            const Member i_end, reference_type update,
-                            const bool final) {
+  inline static std::enable_if_t<std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Member i_begin,
+                        const Member i_end, reference_type update,
+                        const bool final) {
     for (Member i = i_begin; i < i_end; ++i) {
       functor(i, update, final);
     }
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Member i_begin,
-                            const Member i_end, reference_type update,
-                            const bool final) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Member i_begin,
+                        const Member i_end, reference_type update,
+                        const bool final) {
     const TagType t{};
     for (Member i = i_begin; i < i_end; ++i) {
       functor(t, i, update, final);
@@ -1783,50 +1739,53 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     barrier<> bar(num_worker_threads);
     auto exec = Kokkos::Experimental::HPX::impl_get_executor();
 
-    for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-             [this, &bar, &buffer, num_worker_threads, value_count,
-              value_size](int t) {
-               reference_type update_sum = ValueInit::init(
-                   m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
-
-               const WorkRange range(m_policy, t, num_worker_threads);
-               execute_functor_range<WorkTag>(m_functor, range.begin(),
-                                              range.end(), update_sum, false);
+    typename Analysis::Reducer final_reducer(&m_functor);
 
-               bar.arrive_and_wait();
-
-               if (t == 0) {
-                 ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
-                                                buffer.get(0) + value_size));
-
-                 for (int i = 1; i < num_worker_threads; ++i) {
-                   pointer_type ptr_1_prev =
-                       reinterpret_cast<pointer_type>(buffer.get(i - 1));
-                   pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
-                       buffer.get(i - 1) + value_size);
-                   pointer_type ptr_2 = reinterpret_cast<pointer_type>(
-                       buffer.get(i) + value_size);
-
-                   for (int j = 0; j < value_count; ++j) {
-                     ptr_2[j] = ptr_2_prev[j];
-                   }
-
-                   ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
-                 }
-               }
+    for_loop(
+        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
+        [this, &bar, &buffer, num_worker_threads, value_count, value_size,
+         final_reducer](int t) {
+          reference_type update_sum =
+              final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+
+          const WorkRange range(m_policy, t, num_worker_threads);
+          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
+                                         update_sum, false);
+
+          bar.arrive_and_wait();
+
+          if (t == 0) {
+            final_reducer.init(
+                reinterpret_cast<pointer_type>(buffer.get(0) + value_size));
+
+            for (int i = 1; i < num_worker_threads; ++i) {
+              pointer_type ptr_1_prev =
+                  reinterpret_cast<pointer_type>(buffer.get(i - 1));
+              pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
+                  buffer.get(i - 1) + value_size);
+              pointer_type ptr_2 =
+                  reinterpret_cast<pointer_type>(buffer.get(i) + value_size);
+
+              for (int j = 0; j < value_count; ++j) {
+                ptr_2[j] = ptr_2_prev[j];
+              }
+
+              final_reducer.join(ptr_2, ptr_1_prev);
+            }
+          }
 
-               bar.arrive_and_wait();
+          bar.arrive_and_wait();
 
-               reference_type update_base = ValueOps::reference(
-                   reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
+          reference_type update_base = Analysis::Reducer::reference(
+              reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
 
-               execute_functor_range<WorkTag>(m_functor, range.begin(),
-                                              range.end(), update_base, true);
+          execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
+                                         update_base, true);
 
-               if (t == num_worker_threads - 1) {
-                 m_returnvalue = update_base;
-               }
-             });
+          if (t == num_worker_threads - 1) {
+            m_returnvalue = update_base;
+          }
+        });
   }
 
   inline ParallelScanWithTotal(const FunctorType &arg_functor,
@@ -1856,31 +1815,26 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const std::size_t m_shared;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_functor(const FunctorType &functor, const Policy &policy,
-                      const int league_rank, char *local_buffer,
-                      const std::size_t local_buffer_size) {
+  inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Policy &policy, const int league_rank,
+      char *local_buffer, const std::size_t local_buffer_size) {
     functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_functor(const FunctorType &functor, const Policy &policy,
-                      const int league_rank, char *local_buffer,
-                      const std::size_t local_buffer_size) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Policy &policy, const int league_rank,
+      char *local_buffer, const std::size_t local_buffer_size) {
     const TagType t{};
     functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size));
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Policy &policy,
-                            const int league_rank_begin,
-                            const int league_rank_end, char *local_buffer,
-                            const std::size_t local_buffer_size) {
+  inline static std::enable_if_t<std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Policy &policy,
+                        const int league_rank_begin, const int league_rank_end,
+                        char *local_buffer,
+                        const std::size_t local_buffer_size) {
     for (int league_rank = league_rank_begin; league_rank < league_rank_end;
          ++league_rank) {
       functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
@@ -1888,12 +1842,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Policy &policy,
-                            const int league_rank_begin,
-                            const int league_rank_end, char *local_buffer,
-                            const std::size_t local_buffer_size) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Policy &policy,
+                        const int league_rank_begin, const int league_rank_end,
+                        char *local_buffer,
+                        const std::size_t local_buffer_size) {
     const TagType t{};
     for (int league_rank = league_rank_begin; league_rank < league_rank_end;
          ++league_rank) {
@@ -1962,22 +1915,15 @@ template <class FunctorType, class ReducerType, class... Properties>
 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                      ReducerType, Kokkos::Experimental::HPX> {
  private:
-  using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
+  using Policy  = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
   using Member  = typename Policy::member_type;
   using WorkTag = typename Policy::work_tag;
   using ReducerConditional =
       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                          FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-  using ValueInit  = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin  = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-  using ValueOps   = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
   using value_type     = typename Analysis::value_type;
@@ -1992,36 +1938,30 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   bool m_force_synchronous;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_functor(const FunctorType &functor, const Policy &policy,
-                      const int league_rank, char *local_buffer,
-                      const std::size_t local_buffer_size,
-                      reference_type update) {
+  inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Policy &policy, const int league_rank,
+      char *local_buffer, const std::size_t local_buffer_size,
+      reference_type update) {
     functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
             update);
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_functor(const FunctorType &functor, const Policy &policy,
-                      const int league_rank, char *local_buffer,
-                      const std::size_t local_buffer_size,
-                      reference_type update) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
+      const FunctorType &functor, const Policy &policy, const int league_rank,
+      char *local_buffer, const std::size_t local_buffer_size,
+      reference_type update) {
     const TagType t{};
     functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size),
             update);
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Policy &policy,
-                            const int league_rank_begin,
-                            const int league_rank_end, char *local_buffer,
-                            const std::size_t local_buffer_size,
-                            reference_type update) {
+  inline static std::enable_if_t<std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Policy &policy,
+                        const int league_rank_begin, const int league_rank_end,
+                        char *local_buffer, const std::size_t local_buffer_size,
+                        reference_type update) {
     for (int league_rank = league_rank_begin; league_rank < league_rank_end;
          ++league_rank) {
       functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
@@ -2030,13 +1970,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_functor_range(const FunctorType &functor, const Policy &policy,
-                            const int league_rank_begin,
-                            const int league_rank_end, char *local_buffer,
-                            const std::size_t local_buffer_size,
-                            reference_type update) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value>
+  execute_functor_range(const FunctorType &functor, const Policy &policy,
+                        const int league_rank_begin, const int league_rank_end,
+                        char *local_buffer, const std::size_t local_buffer_size,
+                        reference_type update) {
     const TagType t{};
     for (int league_rank = league_rank_begin; league_rank < league_rank_end;
          ++league_rank) {
@@ -2050,10 +1988,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   void execute() const {
     if (m_policy.league_size() * m_policy.team_size() == 0) {
       if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-        ValueFinal::final(ReducerConditional::select(m_functor, m_reducer),
-                          m_result_ptr);
+        typename Analysis::Reducer final_reducer(
+            &ReducerConditional::select(m_functor, m_reducer));
+        final_reducer.init(m_result_ptr);
+        final_reducer.final(m_result_ptr);
       }
       return;
     }
@@ -2078,20 +2016,23 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     using hpx::execution::par;
     using hpx::execution::static_chunk_size;
 
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
 #if KOKKOS_HPX_IMPLEMENTATION == 0
 
-    for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-             [this, &buffer](const std::size_t t) {
-               ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                               reinterpret_cast<pointer_type>(buffer.get(t)));
-             });
+    for_loop(
+        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
+        [&buffer, final_reducer](const std::size_t t) {
+          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+        });
 
     for_loop(par.on(exec).with(static_chunk_size(m_policy.chunk_size())), 0,
              m_policy.league_size(),
              [this, &buffer, value_size](const int league_rank) {
                std::size_t t =
                    Kokkos::Experimental::HPX::impl_hardware_thread_id();
-               reference_type update = ValueOps::reference(
+               reference_type update = Analysis::Reducer::reference(
                    reinterpret_cast<pointer_type>(buffer.get(t)));
 
                execute_functor<WorkTag>(m_functor, m_policy, league_rank,
@@ -2102,17 +2043,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 #elif KOKKOS_HPX_IMPLEMENTATION == 1
     using hpx::for_loop_strided;
 
-    for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
-             [this, &buffer](std::size_t const t) {
-               ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                               reinterpret_cast<pointer_type>(buffer.get(t)));
-             });
+    for_loop(
+        par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
+        [&buffer, final_reducer](std::size_t const t) {
+          final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
+        });
 
     for_loop_strided(
         par.on(exec), 0, m_policy.league_size(), m_policy.chunk_size(),
         [this, &buffer, value_size](int const league_rank_begin) {
           std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
-          reference_type update = ValueOps::reference(
+          reference_type update = Analysis::Reducer::reference(
               reinterpret_cast<pointer_type>(buffer.get(t)));
           const int league_rank_end =
               (std::min)(league_rank_begin + m_policy.chunk_size(),
@@ -2125,12 +2066,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
     const pointer_type ptr = reinterpret_cast<pointer_type>(buffer.get(0));
     for (int t = 1; t < num_worker_threads; ++t) {
-      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
-                      reinterpret_cast<pointer_type>(buffer.get(t)));
+      final_reducer.join(ptr, reinterpret_cast<pointer_type>(buffer.get(t)));
     }
 
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), ptr);
+    final_reducer.final(ptr);
 
     if (m_result_ptr) {
       const int n = Analysis::value_count(
@@ -2143,12 +2082,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType &arg_functor, const Policy &arg_policy,
-      const ViewType &arg_result,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void *>::type = nullptr)
+  ParallelReduce(const FunctorType &arg_functor, const Policy &arg_policy,
+                 const ViewType &arg_result,
+                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void *> = nullptr)
       : m_functor(arg_functor),
         m_league(arg_policy.league_size()),
         m_policy(arg_policy),
@@ -2186,10 +2124,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
 TeamThreadRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
                 const iType2 &i_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
       thread, iType(i_begin), iType(i_end));
 }
@@ -2204,10 +2142,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
 TeamVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
                 const iType2 &i_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
       thread, iType(i_begin), iType(i_end));
 }
@@ -2222,10 +2160,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
 ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
                   const iType2 &i_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
       thread, iType(i_begin), iType(i_end));
 }
@@ -2384,8 +2322,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
         &loop_boundaries,
     const FunctorType &lambda) {
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type  = typename ValueTraits::value_type;
+  using value_type =
+      typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                     TeamPolicy<Experimental::HPX>,
+                                     FunctorType>::value_type;
 
   value_type scan_val = value_type();
 
@@ -2402,11 +2342,11 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
  *
  */
 template <typename iType, class FunctorType, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
-                      iType, Impl::HPXTeamMember> &loop_boundaries,
-                  const FunctorType &lambda, const ReducerType &reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
+        &loop_boundaries,
+    const FunctorType &lambda, const ReducerType &reducer) {
   typename ReducerType::value_type scan_val;
   reducer.init(scan_val);
 
diff --git a/packages/kokkos/core/src/Kokkos_Half.hpp b/packages/kokkos/core/src/Kokkos_Half.hpp
index 7382ffbd4..c1085876c 100644
--- a/packages/kokkos/core/src/Kokkos_Half.hpp
+++ b/packages/kokkos/core/src/Kokkos_Half.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_HALF_HPP_
 #define KOKKOS_HALF_HPP_
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
+#endif
 
 #include <type_traits>
 #include <Kokkos_Macros.hpp>
@@ -1029,4 +1033,8 @@ cast_from_bhalf(bhalf_t val) {
 #else
 #define KOKKOS_BHALF_T_IS_FLOAT false
 #endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
+#endif
 #endif  // KOKKOS_HALF_HPP_
diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
index 034d31fca..1d67e2f9c 100644
--- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_HOSTSPACE_HPP
 #define KOKKOS_HOSTSPACE_HPP
 
@@ -60,6 +69,7 @@
 #include <impl/Kokkos_Tools.hpp>
 
 #include "impl/Kokkos_HostSpace_deepcopy.hpp"
+#include <impl/Kokkos_MemorySpace.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -204,13 +214,12 @@ struct HostMirror {
   };
 
  public:
-  using Space = typename std::conditional<
+  using Space = std::conditional_t<
       keep_exe && keep_mem, S,
-      typename std::conditional<
-          keep_mem,
-          Kokkos::Device<Kokkos::HostSpace::execution_space,
-                         typename S::memory_space>,
-          Kokkos::HostSpace>::type>::type;
+      std::conditional_t<keep_mem,
+                         Kokkos::Device<Kokkos::HostSpace::execution_space,
+                                        typename S::memory_space>,
+                         Kokkos::HostSpace>>;
 };
 
 }  // namespace Impl
@@ -252,6 +261,28 @@ class SharedAllocationRecord<Kokkos::HostSpace, void>
       ;
   SharedAllocationRecord() = default;
 
+  // This constructor does not forward to the one without exec_space arg
+  // in order to work around https://github.com/kokkos/kokkos/issues/5258
+  // This constructor is templated so I can't just put it into the cpp file
+  // like the other constructor.
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /* exec_space*/, const Kokkos::HostSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &deallocate)
+      : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+            &SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record,
+#endif
+            Impl::checked_allocation_with_header(arg_space, arg_label,
+                                                 arg_alloc_size),
+            sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+            arg_label),
+        m_space(arg_space) {
+    this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr,
+                                                    arg_label);
+  }
+
   SharedAllocationRecord(
       const Kokkos::HostSpace& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size,
@@ -280,10 +311,17 @@ namespace Impl {
 
 template <class DT, class... DP>
 struct ZeroMemset<typename HostSpace::execution_space, DT, DP...> {
-  ZeroMemset(const typename HostSpace::execution_space&,
+  ZeroMemset(const typename HostSpace::execution_space& exec,
              const View<DT, DP...>& dst,
-             typename View<DT, DP...>::const_value_type& value)
-      : ZeroMemset(dst, value) {}
+             typename View<DT, DP...>::const_value_type&) {
+    // Host spaces, except for HPX, are synchronous and we need to fence for HPX
+    // since we can't properly enqueue a std::memset otherwise.
+    // We can't use exec.fence() directly since we don't have a full definition
+    // of HostSpace here.
+    hostspace_fence(exec);
+    using ValueType = typename View<DT, DP...>::value_type;
+    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
+  }
 
   ZeroMemset(const View<DT, DP...>& dst,
              typename View<DT, DP...>::const_value_type&) {
diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp
index cfd77ea50..78173c083 100644
--- a/packages/kokkos/core/src/Kokkos_Layout.hpp
+++ b/packages/kokkos/core/src/Kokkos_Layout.hpp
@@ -45,6 +45,15 @@
 /// \file Kokkos_Layout.hpp
 /// \brief Declaration of various \c MemoryLayout options.
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_LAYOUT_HPP
 #define KOKKOS_LAYOUT_HPP
 
@@ -84,9 +93,14 @@ struct LayoutLeft {
   LayoutLeft& operator=(LayoutLeft&&) = default;
 
   KOKKOS_INLINE_FUNCTION
-  explicit constexpr LayoutLeft(size_t N0 = 0, size_t N1 = 0, size_t N2 = 0,
-                                size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
-                                size_t N6 = 0, size_t N7 = 0)
+  explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                size_t N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                size_t N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                size_t N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                size_t N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
 
   friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) {
@@ -128,9 +142,14 @@ struct LayoutRight {
   LayoutRight& operator=(LayoutRight&&) = default;
 
   KOKKOS_INLINE_FUNCTION
-  explicit constexpr LayoutRight(size_t N0 = 0, size_t N1 = 0, size_t N2 = 0,
-                                 size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
-                                 size_t N6 = 0, size_t N7 = 0)
+  explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                 size_t N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                 size_t N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                 size_t N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                 size_t N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                 size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                 size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                                 size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
 
   friend bool operator==(const LayoutRight& left, const LayoutRight& right) {
@@ -177,7 +196,7 @@ struct LayoutStride {
     // Verify valid rank order:
     int check_input = ARRAY_LAYOUT_MAX_RANK < rank ? 0 : int(1 << rank) - 1;
     for (int r = 0; r < ARRAY_LAYOUT_MAX_RANK; ++r) {
-      tmp.dimension[r] = 0;
+      tmp.dimension[r] = KOKKOS_IMPL_CTOR_DEFAULT_ARG;
       tmp.stride[r]    = 0;
     }
     for (int r = 0; r < rank; ++r) {
@@ -195,12 +214,15 @@ struct LayoutStride {
   }
 
   KOKKOS_INLINE_FUNCTION
-  explicit constexpr LayoutStride(size_t N0 = 0, size_t S0 = 0, size_t N1 = 0,
-                                  size_t S1 = 0, size_t N2 = 0, size_t S2 = 0,
-                                  size_t N3 = 0, size_t S3 = 0, size_t N4 = 0,
-                                  size_t S4 = 0, size_t N5 = 0, size_t S5 = 0,
-                                  size_t N6 = 0, size_t S6 = 0, size_t N7 = 0,
-                                  size_t S7 = 0)
+  explicit constexpr LayoutStride(
+      size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S0 = 0,
+      size_t N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S1 = 0,
+      size_t N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S2 = 0,
+      size_t N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S3 = 0,
+      size_t N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S4 = 0,
+      size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S5 = 0,
+      size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S6 = 0,
+      size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3,
                                                           S4, S5, S6, S7} {}
 
@@ -234,9 +256,8 @@ template <typename LayoutTiledCheck, class Enable = void>
 struct is_layouttiled : std::false_type {};
 
 template <typename LayoutTiledCheck>
-struct is_layouttiled<
-    LayoutTiledCheck,
-    typename std::enable_if<LayoutTiledCheck::is_array_layout_tiled>::type>
+struct is_layouttiled<LayoutTiledCheck,
+                      std::enable_if_t<LayoutTiledCheck::is_array_layout_tiled>>
     : std::true_type {};
 
 namespace Experimental {
@@ -336,7 +357,7 @@ template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3,
           unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7>
 struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled<
     Kokkos::Iterate::Left, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3,
-    ArgN4, ArgN5, ArgN6, ArgN7, true> > {
+    ArgN4, ArgN5, ArgN6, ArgN7, true>> {
   static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left;
   static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left;
 };
@@ -345,7 +366,7 @@ template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3,
           unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7>
 struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled<
     Kokkos::Iterate::Right, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3,
-    ArgN4, ArgN5, ArgN6, ArgN7, true> > {
+    ArgN4, ArgN5, ArgN6, ArgN7, true>> {
   static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right;
   static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left;
 };
@@ -354,7 +375,7 @@ template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3,
           unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7>
 struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled<
     Kokkos::Iterate::Left, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3,
-    ArgN4, ArgN5, ArgN6, ArgN7, true> > {
+    ArgN4, ArgN5, ArgN6, ArgN7, true>> {
   static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left;
   static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right;
 };
@@ -363,7 +384,7 @@ template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3,
           unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7>
 struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled<
     Kokkos::Iterate::Right, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3,
-    ArgN4, ArgN5, ArgN6, ArgN7, true> > {
+    ArgN4, ArgN5, ArgN6, ArgN7, true>> {
   static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right;
   static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right;
 };
diff --git a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp
index 6dcbe2790..d3ce354c2 100644
--- a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp
+++ b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_LOGICALSPACES_HPP
 #define KOKKOS_LOGICALSPACES_HPP
 
@@ -98,9 +107,9 @@ class LogicalMemorySpace {
   /// parallel using the View's default execution space).
 
   using execution_space =
-      typename std::conditional<std::is_void<DefaultBaseExecutionSpace>::value,
-                                typename BaseSpace::execution_space,
-                                DefaultBaseExecutionSpace>::type;
+      std::conditional_t<std::is_void<DefaultBaseExecutionSpace>::value,
+                         typename BaseSpace::execution_space,
+                         DefaultBaseExecutionSpace>;
 
   using device_type = Kokkos::Device<execution_space, memory_space>;
 
@@ -247,6 +256,14 @@ class SharedAllocationRecord<Kokkos::Experimental::LogicalMemorySpace<
   }
   SharedAllocationRecord() = default;
 
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/, const SpaceType& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
   SharedAllocationRecord(
       const SpaceType& arg_space, const std::string& arg_label,
       const size_t arg_alloc_size,
diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp
index 8c2d414a9..9dbd2de0c 100644
--- a/packages/kokkos/core/src/Kokkos_Macros.hpp
+++ b/packages/kokkos/core/src/Kokkos_Macros.hpp
@@ -228,8 +228,8 @@
 #define KOKKOS_ENABLE_PRAGMA_SIMD 1
 #endif
 
-// FIXME Workaround for ICE with intel 17,18,19 in Trilinos
-#if (KOKKOS_COMPILER_INTEL <= 1900)
+// FIXME Workaround for ICE with intel 17,18,19,20,21 in Trilinos
+#if (KOKKOS_COMPILER_INTEL <= 2100)
 #define KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
 #endif
 
@@ -264,12 +264,13 @@
 #define KOKKOS_ENABLE_ASM 1
 #endif
 
-#if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION)
+#if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION)
 #if !defined(_WIN32)
-#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
-#define KOKKOS_IMPL_FORCEINLINE __attribute__((always_inline))
+#define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \
+  inline __attribute__((always_inline))
+#define KOKKOS_IMPL_HOST_FORCEINLINE __attribute__((always_inline))
 #else
-#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline
+#define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION inline
 #endif
 #endif
 
@@ -320,9 +321,10 @@
 //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
 //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
-#if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION)
-#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
-#define KOKKOS_IMPL_FORCEINLINE __attribute__((always_inline))
+#if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION)
+#define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \
+  inline __attribute__((always_inline))
+#define KOKKOS_IMPL_HOST_FORCEINLINE __attribute__((always_inline))
 #endif
 
 #if !defined(KOKKOS_IMPL_ALIGN_PTR)
@@ -345,9 +347,10 @@
 #define KOKKOS_ENABLE_RFO_PREFETCH 1
 #endif
 
-#if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION)
-#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
-#define KOKKOS_IMPL_FORCEINLINE __attribute__((always_inline))
+#if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION)
+#define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \
+  inline __attribute__((always_inline))
+#define KOKKOS_IMPL_HOST_FORCEINLINE __attribute__((always_inline))
 #endif
 
 #define KOKKOS_RESTRICT __restrict__
@@ -380,12 +383,20 @@
 //----------------------------------------------------------------------------
 // Define function marking macros if compiler specific macros are undefined:
 
+#if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION)
+#define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION inline
+#endif
+
+#if !defined(KOKKOS_IMPL_HOST_FORCEINLINE)
+#define KOKKOS_IMPL_HOST_FORCEINLINE inline
+#endif
+
 #if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION)
-#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline
+#define KOKKOS_IMPL_FORCEINLINE_FUNCTION KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
 #endif
 
 #if !defined(KOKKOS_IMPL_FORCEINLINE)
-#define KOKKOS_IMPL_FORCEINLINE inline
+#define KOKKOS_IMPL_FORCEINLINE KOKKOS_IMPL_HOST_FORCEINLINE
 #endif
 
 #if !defined(KOKKOS_IMPL_INLINE_FUNCTION)
@@ -564,8 +575,9 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #endif
 
 #if !defined(KOKKOS_IF_ON_HOST) && !defined(KOKKOS_IF_ON_DEVICE)
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
-    defined(__SYCL_DEVICE_ONLY__)
+#if (defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) ||         \
+    (defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \
+    (defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
 #define KOKKOS_IF_ON_DEVICE(CODE) \
   { KOKKOS_IMPL_STRIP_PARENS(CODE) }
 #define KOKKOS_IF_ON_HOST(CODE) \
@@ -578,15 +590,6 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #endif
 #endif
 
-//----------------------------------------------------------------------------
-
-#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) || \
-    (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 600)
-#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
-#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
-#endif
-#endif
-
 //----------------------------------------------------------------------------
 // If compiling with CUDA, we must use relocatable device code to enable the
 // task policy.
@@ -609,11 +612,11 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #define KOKKOS_DEPRECATED_TRAILING_ATTRIBUTE
 #endif
 
-// Guard intel compiler version <= 1900
+// Guard intel compiler version 19 and older
 // intel error #2651: attribute does not apply to any entity
 // using <deprecated_type> KOKKOS_DEPRECATED = ...
 #if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && !defined(__NVCC__) && \
-    (!defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL > 1900)
+    (!defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021)
 #define KOKKOS_DEPRECATED [[deprecated]]
 #define KOKKOS_DEPRECATED_WITH_COMMENT(comment) [[deprecated(comment)]]
 #else
@@ -661,11 +664,9 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #undef __CUDA_ARCH__
 #endif
 
-#if (defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG)) || \
-    (defined(KOKKOS_COMPILER_INTEL) && defined(_WIN32))
-#define KOKKOS_THREAD_LOCAL __declspec(thread)
-#else
-#define KOKKOS_THREAD_LOCAL __thread
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+#define KOKKOS_THREAD_LOCAL \
+  KOKKOS_DEPRECATED_WITH_COMMENT("Use thread_local instead!") thread_local
 #endif
 
 #if (defined(KOKKOS_IMPL_WINDOWS_CUDA) || defined(KOKKOS_COMPILER_MSVC)) && \
diff --git a/packages/kokkos/core/src/Kokkos_MasterLock.hpp b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
index cbfbb9266..5e48595bf 100644
--- a/packages/kokkos/core/src/Kokkos_MasterLock.hpp
+++ b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_MASTER_LOCK_HPP
 #define KOKKOS_MASTER_LOCK_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp b/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp
index c6b8c08dc..8f7b559e7 100644
--- a/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp
+++ b/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp
@@ -43,6 +43,10 @@
 */
 #ifndef KOKKOS_MATHEMATICAL_CONSTANTS_HPP
 #define KOKKOS_MATHEMATICAL_CONSTANTS_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <type_traits>
@@ -82,4 +86,8 @@ KOKKOS_IMPL_MATH_CONSTANT(phi,        1.618033988749894848204586834365638118L);
 
 }  // namespace Experimental
 }  // namespace Kokkos
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS
+#endif
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
index 6ee8d7745..6942f8495 100644
--- a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
+++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
@@ -44,9 +44,14 @@
 
 #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_HPP
 #define KOKKOS_MATHEMATICAL_FUNCTIONS_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHFUNCTIONS
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <cmath>
+#include <cstdlib>
 #include <type_traits>
 
 #ifdef KOKKOS_ENABLE_SYCL
@@ -76,29 +81,40 @@ struct promote<float> {
 };
 template <class T>
 using promote_t = typename promote<T>::type;
-template <class T, class U>
+template <class T, class U,
+          bool = std::is_arithmetic<T>::value&& std::is_arithmetic<U>::value>
 struct promote_2 {
   using type = decltype(promote_t<T>() + promote_t<U>());
 };
 template <class T, class U>
+struct promote_2<T, U, false> {};
+template <class T, class U>
 using promote_2_t = typename promote_2<T, U>::type;
 }  // namespace Impl
 
-namespace Experimental {
-
 // NOTE long double overloads are not available on the device
 
 #if defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE sycl
 #else
-#if defined(KOKKOS_COMPILER_NVCC) && defined(__GNUC__) && (__GNUC__ < 6) && \
-    !defined(__clang__)
+#if (defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC)) && \
+    defined(__GNUC__) && (__GNUC__ < 6) && !defined(__clang__)
 #define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE
 #else
 #define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE std
 #endif
 #endif
 
+#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3)
+#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \
+    USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE)                      \
+  USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE
+#else
+#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \
+    USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE)                      \
+  /* nothing */
+#endif
+
 #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                                 \
   KOKKOS_INLINE_FUNCTION float FUNC(float x) {                                \
     using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
@@ -125,7 +141,13 @@ namespace Experimental {
   FUNC(T x) {                                                                 \
     using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
     return FUNC(static_cast<double>(x));                                      \
-  }
+  }                                                                           \
+  KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED(              \
+      namespace Experimental {                                                \
+        using ::Kokkos::FUNC;                                                 \
+        using ::Kokkos::FUNC##f;                                              \
+        using ::Kokkos::FUNC##l;                                              \
+      })
 
 // isinf, isnan, and isinfinite do not work on Windows with CUDA with std::
 // getting warnings about calling host function in device function then
@@ -142,7 +164,9 @@ namespace Experimental {
   KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, bool> \
   FUNC(T x) {                                                               \
     return ::FUNC(static_cast<double>(x));                                  \
-  }
+  }                                                                         \
+  KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED(            \
+      namespace Experimental { using ::Kokkos::FUNC; })
 #else
 #define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                              \
   KOKKOS_INLINE_FUNCTION bool FUNC(float x) {                               \
@@ -162,7 +186,9 @@ namespace Experimental {
   FUNC(T x) {                                                               \
     using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
     return FUNC(static_cast<double>(x));                                    \
-  }
+  }                                                                         \
+  KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED(            \
+      namespace Experimental { using ::Kokkos::FUNC; })
 #endif
 
 #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                          \
@@ -208,20 +234,35 @@ namespace Experimental {
     static_assert(std::is_same<Promoted, long double>::value, "");      \
     using std::FUNC;                                                    \
     return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y));    \
-  }
-
+  }                                                                     \
+  KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED(        \
+      namespace Experimental {                                          \
+        using ::Kokkos::FUNC;                                           \
+        using ::Kokkos::FUNC##f;                                        \
+        using ::Kokkos::FUNC##l;                                        \
+      })
 // Basic operations
 KOKKOS_INLINE_FUNCTION int abs(int n) {
   using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
   return abs(n);
 }
 KOKKOS_INLINE_FUNCTION long abs(long n) {
+// FIXME_NVHPC ptxas fatal   : unresolved extern function 'labs'
+#ifdef KOKKOS_COMPILER_NVHPC
+  return n > 0 ? n : -n;
+#else
   using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
   return abs(n);
+#endif
 }
 KOKKOS_INLINE_FUNCTION long long abs(long long n) {
+// FIXME_NVHPC ptxas fatal   : unresolved extern function 'labs'
+#ifdef KOKKOS_COMPILER_NVHPC
+  return n > 0 ? n : -n;
+#else
   using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
   return abs(n);
+#endif
 }
 KOKKOS_INLINE_FUNCTION float abs(float x) {
   using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
@@ -235,11 +276,15 @@ inline long double abs(long double x) {
   using std::abs;
   return abs(x);
 }
+KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED(
+    namespace Experimental { using ::Kokkos::abs; })
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder)
-KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin)
+// remquo
+// fma
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmax)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fdim)
 #ifndef KOKKOS_ENABLE_SYCL
 KOKKOS_INLINE_FUNCTION float nanf(char const* arg) { return ::nanf(arg); }
@@ -253,19 +298,46 @@ KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); }
 KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); }
 #endif
 inline long double nanl(char const* arg) { return ::nanl(arg); }
-// Power functions
-KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow)
-KOKKOS_IMPL_MATH_UNARY_FUNCTION(sqrt)
-KOKKOS_IMPL_MATH_UNARY_FUNCTION(cbrt)
-KOKKOS_IMPL_MATH_BINARY_FUNCTION(hypot)
+KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED(
+    namespace Experimental {
+      using ::Kokkos::nan;
+      using ::Kokkos::nanf;
+      using ::Kokkos::nanl;
+    })
 // Exponential functions
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp)
+// FIXME_NVHPC nvc++ has issues with exp2
+#ifndef KOKKOS_COMPILER_NVHPC
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp2)
+#else
+KOKKOS_INLINE_FUNCTION float exp2(float val) {
+  constexpr float ln2 = 0.693147180559945309417232121458176568L;
+  return exp(ln2 * val);
+}
+KOKKOS_INLINE_FUNCTION double exp2(double val) {
+  constexpr double ln2 = 0.693147180559945309417232121458176568L;
+  return exp(ln2 * val);
+}
+inline long double exp2(long double val) {
+  constexpr long double ln2 = 0.693147180559945309417232121458176568L;
+  return exp(ln2 * val);
+}
+template <class T>
+KOKKOS_INLINE_FUNCTION double exp2(T val) {
+  constexpr double ln2 = 0.693147180559945309417232121458176568L;
+  return exp(ln2 * static_cast<double>(val));
+}
+#endif
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(expm1)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(log)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(log10)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(log2)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(log1p)
+// Power functions
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(sqrt)
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(cbrt)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(hypot)
 // Trigonometric functions
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(sin)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(cos)
@@ -290,21 +362,51 @@ KOKKOS_IMPL_MATH_UNARY_FUNCTION(lgamma)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(ceil)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(floor)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(trunc)
-// FIXME_SYCL not available as of current SYCL specification v1.2.1
-#ifndef KOKKOS_ENABLE_SYCL
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(round)
+// lround
+// llround
+// FIXME_SYCL not available as of current SYCL 2020 specification (revision 4)
+#ifndef KOKKOS_ENABLE_SYCL  // FIXME_SYCL
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(nearbyint)
 #endif
+// rint
+// lrint
+// llrint
+// Floating point manipulation functions
+// frexp
+// ldexp
+// modf
+// scalbn
+// scalbln
+// ilog
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(logb)
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(nextafter)
+// nexttoward
+KOKKOS_IMPL_MATH_BINARY_FUNCTION(copysign)
 // Classification and comparison
+// fpclassify
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isfinite)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isinf)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnan)
+// isnormal
+KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit)
+// isgreater
+// isgreaterequal
+// isless
+// islessequal
+// islessgreater
+// isunordered
 
+#undef KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED
 #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE
 #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION
 #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE
 #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION
 
-}  // namespace Experimental
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHFUNCTIONS
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHFUNCTIONS
+#endif
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
index 03c491c36..07da1dbd7 100644
--- a/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
+++ b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
 #define KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHSPECFUNCTIONS
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <cmath>
@@ -62,12 +66,12 @@ template <class RealType>
 KOKKOS_INLINE_FUNCTION RealType expint1(RealType x) {
   // This function is a conversion of the corresponding Fortran program in
   // S. Zhang & J. Jin "Computation of Special Functions" (Wiley, 1996).
+  using Kokkos::exp;
+  using Kokkos::fabs;
+  using Kokkos::log;
+  using Kokkos::pow;
   using Kokkos::Experimental::epsilon;
-  using Kokkos::Experimental::exp;
-  using Kokkos::Experimental::fabs;
   using Kokkos::Experimental::infinity;
-  using Kokkos::Experimental::log;
-  using Kokkos::Experimental::pow;
 
   RealType e1;
 
@@ -114,14 +118,14 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erf(
   //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
   //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
   //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
-  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymtotic expansion
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymptotic expansion
   //  Error condition: abs(z^2) > 670 is a fatal overflow error
-  using Kokkos::Experimental::cos;
+  using Kokkos::cos;
+  using Kokkos::exp;
+  using Kokkos::fabs;
+  using Kokkos::sin;
   using Kokkos::Experimental::epsilon;
-  using Kokkos::Experimental::exp;
-  using Kokkos::Experimental::fabs;
   using Kokkos::Experimental::infinity;
-  using Kokkos::Experimental::sin;
 
   using CmplxType = Kokkos::complex<RealType>;
 
@@ -248,7 +252,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erf(
         if (z.real() < 0.0) cans = -cans;
       }       // end (abs(yp) < 6.0)
       else {  //(abs(YP)>=6.0)
-        // Asymtotic expansion for 0<=xp<=1 and abs(yp)>=6
+        // Asymptotic expansion for 0<=xp<=1 and abs(yp)>=6
         CmplxType rcz   = 0.5 / cz;
         CmplxType accum = CmplxType(1.0, 0.0);
         CmplxType term  = accum;
@@ -291,15 +295,15 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erfcx(
   //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
   //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
   //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
-  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymtotic expansion
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymptotic expansion
   // Error condition: abs(z^2) > 670 is a fatal overflow error when x<0
-  using Kokkos::Experimental::cos;
+  using Kokkos::cos;
+  using Kokkos::exp;
+  using Kokkos::fabs;
+  using Kokkos::isinf;
+  using Kokkos::sin;
   using Kokkos::Experimental::epsilon;
-  using Kokkos::Experimental::exp;
-  using Kokkos::Experimental::fabs;
   using Kokkos::Experimental::infinity;
-  using Kokkos::Experimental::isinf;
-  using Kokkos::Experimental::sin;
 
   using CmplxType = Kokkos::complex<RealType>;
 
@@ -440,7 +444,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erfcx(
           cans = cz * (1.0 + w) + rcz * CmplxType(s1, s2) / pi;
       }       // end (abs(yp) < 6.0)
       else {  //(abs(YP)>=6.0)
-        // Asymtotic expansion for 0<=xp<=1 and abs(yp)>=6
+        // Asymptotic expansion for 0<=xp<=1 and abs(yp)>=6
         CmplxType rcz   = 0.5 / cz;
         CmplxType accum = CmplxType(1.0, 0.0);
         CmplxType term  = accum;
@@ -486,8 +490,8 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j0(const CmplxType& z,
   //                       argument regions
   //         bw_start  --- Starting point for backward recurrence
   // Output:  cbj0      --- J0(z)
-  using Kokkos::Experimental::fabs;
-  using Kokkos::Experimental::pow;
+  using Kokkos::fabs;
+  using Kokkos::pow;
 
   CmplxType cbj0;
   constexpr auto pi    = Kokkos::Experimental::pi_v<RealType>;
@@ -574,9 +578,9 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y0(const CmplxType& z,
   //                           argument regions
   //             bw_start  --- Starting point for backward recurrence
   //    Output:  cby0      --- Y0(z)
-  using Kokkos::Experimental::fabs;
+  using Kokkos::fabs;
+  using Kokkos::pow;
   using Kokkos::Experimental::infinity;
-  using Kokkos::Experimental::pow;
 
   constexpr auto inf = infinity<RealType>::value;
 
@@ -675,8 +679,8 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j1(const CmplxType& z,
   //                           argument regions
   //             bw_start  --- Starting point for backward recurrence
   //    Output:  cbj1      --- J1(z)
-  using Kokkos::Experimental::fabs;
-  using Kokkos::Experimental::pow;
+  using Kokkos::fabs;
+  using Kokkos::pow;
 
   CmplxType cbj1;
   constexpr auto pi     = Kokkos::Experimental::pi_v<RealType>;
@@ -767,9 +771,9 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y1(const CmplxType& z,
   //                           argument regions
   //             bw_start  --- Starting point for backward recurrence
   //    Output:  cby1      --- Y1(z)
-  using Kokkos::Experimental::fabs;
+  using Kokkos::fabs;
+  using Kokkos::pow;
   using Kokkos::Experimental::infinity;
-  using Kokkos::Experimental::pow;
 
   constexpr auto inf = infinity<RealType>::value;
 
@@ -943,8 +947,8 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k0(const CmplxType& z,
   //                           argument regions
   //             bw_start  --- Starting point for backward recurrence
   //    Output:  cbk0      --- K0(z)
+  using Kokkos::pow;
   using Kokkos::Experimental::infinity;
-  using Kokkos::Experimental::pow;
 
   constexpr auto inf = infinity<RealType>::value;
 
@@ -1089,8 +1093,8 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k1(const CmplxType& z,
   //                           argument regions
   //             bw_start  --- Starting point for backward recurrence
   //    Output:  cbk1      --- K1(z)
+  using Kokkos::pow;
   using Kokkos::Experimental::infinity;
-  using Kokkos::Experimental::pow;
 
   constexpr auto inf = infinity<RealType>::value;
 
@@ -1274,4 +1278,8 @@ KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h21(const CmplxType& z) {
 }  // namespace Experimental
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHSPECFUNCTIONS
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHSPECFUNCTIONS
+#endif
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
index 7dce3f478..368d49018 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_MEMORYPOOL_HPP
 #define KOKKOS_MEMORYPOOL_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
index e3cee93e2..079384f15 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_MEMORYTRAITS_HPP
 #define KOKKOS_MEMORYTRAITS_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp b/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp
index a82e13df7..6cb8d1669 100644
--- a/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp
+++ b/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_MIN_MAX_CLAMP_HPP
 #define KOKKOS_MIN_MAX_CLAMP_HPP
 
@@ -51,7 +60,6 @@
 #include <initializer_list>
 
 namespace Kokkos {
-namespace Experimental {
 
 // clamp
 template <class T>
@@ -223,7 +231,15 @@ KOKKOS_INLINE_FUNCTION constexpr Kokkos::pair<T, T> minmax(
   return result;
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+namespace Experimental {
+using ::Kokkos::clamp;
+using ::Kokkos::max;
+using ::Kokkos::min;
+using ::Kokkos::minmax;
 }  // namespace Experimental
+#endif
+
 }  // namespace Kokkos
 
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
index 67f017c69..e529aba85 100644
--- a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_NUMERIC_TRAITS_HPP
 #define KOKKOS_NUMERIC_TRAITS_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NUMERICTRAITS
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <cfloat>
@@ -650,4 +654,8 @@ struct reduction_identity<long double> {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NUMERICTRAITS
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NUMERICTRAITS
+#endif
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
index 767e5b932..775b47044 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_OPENMP_HPP
 #define KOKKOS_OPENMP_HPP
 
@@ -62,8 +71,9 @@
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 
 #include <vector>
 
@@ -72,7 +82,7 @@
 namespace Kokkos {
 
 namespace Impl {
-class OpenMPExec;
+class OpenMPInternal;
 }
 
 /// \class OpenMP
@@ -95,8 +105,10 @@ class OpenMP {
   using size_type            = memory_space::size_type;
   using scratch_memory_space = ScratchMemorySpace<OpenMP>;
 
+  OpenMP();
+
   /// \brief Print configuration information to the given output stream.
-  static void print_configuration(std::ostream&, const bool verbose = false);
+  void print_configuration(std::ostream& os, bool verbose = false) const;
 
   /// \brief is the instance running a parallel algorithm
   inline static bool in_parallel(OpenMP const& = OpenMP()) noexcept;
@@ -104,11 +116,10 @@ class OpenMP {
   /// \brief Wait until all dispatched functors complete on the given instance
   ///
   ///  This is a no-op on OpenMP
-  static void impl_static_fence(OpenMP const&           = OpenMP(),
-                                const std::string& name = "") noexcept;
+  static void impl_static_fence(std::string const& name);
 
-  void fence() const;
-  void fence(const std::string& name) const;
+  void fence(std::string const& name =
+                 "Kokkos::OpenMP::fence: Unnamed Instance Fence") const;
 
   /// \brief Does the given instance return immediately after launching
   /// a parallel algorithm
@@ -116,6 +127,7 @@ class OpenMP {
   /// This always returns false on OpenMP
   inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept;
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /// \brief Partition the default instance into new instances without creating
   ///  new masters
   ///
@@ -129,7 +141,6 @@ class OpenMP {
   /// This is a no-op on OpenMP since a non default instance cannot be created
   static OpenMP create_instance(...);
 
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /// \brief Partition the default instance and call 'f' on each new 'master'
   /// thread
   ///
@@ -144,7 +155,7 @@ class OpenMP {
   // use UniqueToken
   static int concurrency();
 
-  static void impl_initialize(int thread_count = -1);
+  static void impl_initialize(InitializationSettings const&);
 
   /// \brief is the default execution space initialized for current 'master'
   /// thread
@@ -170,8 +181,23 @@ class OpenMP {
 
   static int impl_get_current_max_threads() noexcept;
 
+  Impl::OpenMPInternal* impl_internal_space_instance() const {
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    return m_space_instance;
+#else
+    return m_space_instance.get();
+#endif
+  }
+
   static constexpr const char* name() noexcept { return "OpenMP"; }
   uint32_t impl_instance_id() const noexcept { return 1; }
+
+ private:
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+  Impl::OpenMPInternal* m_space_instance;
+#else
+  Kokkos::Impl::HostSharedPtr<Impl::OpenMPInternal> m_space_instance;
+#endif
 };
 
 namespace Tools {
@@ -183,21 +209,6 @@ struct DeviceTypeTraits<OpenMP> {
 };
 }  // namespace Experimental
 }  // namespace Tools
-
-namespace Impl {
-
-class OpenMPSpaceInitializer : public ExecSpaceInitializerBase {
- public:
-  OpenMPSpaceInitializer()  = default;
-  ~OpenMPSpaceInitializer() = default;
-  void initialize(const InitArguments& args) final;
-  void finalize(const bool) final;
-  void fence() final;
-  void fence(const std::string&) final;
-  void print_configuration(std::ostream& msg, const bool detail) final;
-};
-
-}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -220,7 +231,7 @@ struct MemorySpaceAccess<Kokkos::OpenMP::memory_space,
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
-#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
 #include <OpenMP/Kokkos_OpenMP_Team.hpp>
 #include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
 #include <OpenMP/Kokkos_OpenMP_Task.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
index 373dc3d9c..637b4c08f 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_OPENMPTARGET_HPP
 #define KOKKOS_OPENMPTARGET_HPP
 
@@ -59,8 +68,8 @@
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
@@ -90,27 +99,27 @@ class OpenMPTarget {
 
   inline static bool in_parallel() { return omp_in_parallel(); }
 
-  static void fence();
-  static void fence(const std::string&);
+  static void fence(const std::string& name =
+                        "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence");
+
+  static void impl_static_fence(const std::string& name);
 
-  static void impl_static_fence();
-  static void impl_static_fence(const std::string&);
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
 
   //! Print configuration information to the given output stream.
-  void print_configuration(std::ostream&, const bool detail = false);
+  void print_configuration(std::ostream& os, bool verbose = false) const;
 
   static const char* name();
 
   //! Free any resources being consumed by the device.
-  void impl_finalize();
+  static void impl_finalize();
 
   //! Has been initialized
   static int impl_is_initialized();
 
   //! Initialize, telling the CUDA run-time library which device to use.
-  void impl_initialize();
+  static void impl_initialize(InitializationSettings const&);
 
   inline Impl::OpenMPTargetInternal* impl_internal_space_instance() const {
     return m_space_instance;
@@ -124,6 +133,17 @@ class OpenMPTarget {
 };
 }  // namespace Experimental
 
+namespace Impl {
+template <>
+struct MemorySpaceAccess<
+    Kokkos::Experimental::OpenMPTargetSpace,
+    Kokkos::Experimental::OpenMPTarget::scratch_memory_space> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
+};
+}  // namespace Impl
+
 namespace Tools {
 namespace Experimental {
 template <>
@@ -137,20 +157,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> {
 }  // namespace Experimental
 }  // namespace Tools
 
-namespace Impl {
-
-class OpenMPTargetSpaceInitializer : public ExecSpaceInitializerBase {
- public:
-  OpenMPTargetSpaceInitializer()  = default;
-  ~OpenMPTargetSpaceInitializer() = default;
-  void initialize(const InitArguments& args) final;
-  void finalize(const bool) final;
-  void fence() final;
-  void fence(const std::string&) final;
-  void print_configuration(std::ostream& msg, const bool detail) final;
-};
-
-}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
index 25c852717..b4897449c 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_OPENMPTARGETSPACE_HPP
 #define KOKKOS_OPENMPTARGETSPACE_HPP
 
@@ -113,14 +122,6 @@ struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
 };
 
 //----------------------------------------
-
-template <>
-struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
-                         Kokkos::Experimental::OpenMPTargetSpace> {
-  enum : bool { assignable = true };
-  enum : bool { accessible = true };
-  enum : bool { deepcopy = false };
-};
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -161,13 +162,29 @@ class OpenMPTargetSpace {
 
   /**\brief  Allocate untracked memory in the space */
   void* allocate(const size_t arg_alloc_size) const;
+  void* allocate(const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
 
   /**\brief  Deallocate untracked memory in the space */
-  void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
+  void deallocate(void* const arg_alloc_ptr,
+                  const std::size_t arg_alloc_size) const;
+  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                  const size_t arg_alloc_size,
+                  const size_t arg_logical_size = 0) const;
 
   static constexpr const char* name() { return "OpenMPTargetSpace"; }
 
  private:
+  void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
+  void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                       const size_t arg_alloc_size,
+                       const size_t arg_logical_size = 0,
+                       const Kokkos::Tools::SpaceHandle =
+                           Kokkos::Tools::make_space_handle(name())) const;
+
   friend class Kokkos::Impl::SharedAllocationRecord<
       Kokkos::Experimental::OpenMPTargetSpace, void>;
 };
@@ -208,6 +225,15 @@ class SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>
   ~SharedAllocationRecord();
   SharedAllocationRecord() = default;
 
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::Experimental::OpenMPTargetSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
   SharedAllocationRecord(
       const Kokkos::Experimental::OpenMPTargetSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp
index 6045737aa..7e5b7ce25 100644
--- a/packages/kokkos/core/src/Kokkos_Pair.hpp
+++ b/packages/kokkos/core/src/Kokkos_Pair.hpp
@@ -48,6 +48,10 @@
 
 #ifndef KOKKOS_PAIR_HPP
 #define KOKKOS_PAIR_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PAIR
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <utility>
@@ -84,17 +88,28 @@ struct pair {
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(first_type const& f,
-                                             second_type const& s)
-      : first(f), second(s) {}
+#ifdef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC bug in NVHPC regarding constexpr
+                              // constructors used in device code
+  KOKKOS_FORCEINLINE_FUNCTION
+#else
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+#endif
+  pair(first_type const& f, second_type const& s) : first(f), second(s) {}
 
   /// \brief Copy constructor.
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const pair<U, V>& p)
-      : first(p.first), second(p.second) {}
+#ifdef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC bug in NVHPC regarding constexpr
+                              // constructors used in device code
+  KOKKOS_FORCEINLINE_FUNCTION
+#else
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+#endif
+  pair(const pair<U, V>& p)
+      : first(p.first), second(p.second) {
+  }
 
   /// \brief Copy constructor.
   ///
@@ -504,4 +519,8 @@ struct is_pair_like<std::pair<T, U>> : std::true_type {};
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PAIR
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PAIR
+#endif
 #endif  // KOKKOS_PAIR_HPP
diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp
index c12cd77d3..2b5e39d24 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp
@@ -45,6 +45,15 @@
 /// \file Kokkos_Parallel.hpp
 /// \brief Declaration of parallel operators
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_PARALLEL_HPP
 #define KOKKOS_PARALLEL_HPP
 
@@ -58,7 +67,6 @@
 
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 
 #include <cstddef>
 #include <type_traits>
@@ -151,12 +159,11 @@ namespace Kokkos {
  * This compares to a single iteration \c iwork of a \c for loop.
  * If \c execution_space is not defined DefaultExecutionSpace will be used.
  */
-template <class ExecPolicy, class FunctorType>
-inline void parallel_for(
-    const ExecPolicy& policy, const FunctorType& functor,
-    const std::string& str = "",
-    typename std::enable_if<
-        Kokkos::is_execution_policy<ExecPolicy>::value>::type* = nullptr) {
+template <
+    class ExecPolicy, class FunctorType,
+    class Enable = std::enable_if_t<is_execution_policy<ExecPolicy>::value>>
+inline void parallel_for(const std::string& str, const ExecPolicy& policy,
+                         const FunctorType& functor) {
   uint64_t kpID = 0;
 
   ExecPolicy inner_policy = policy;
@@ -171,34 +178,51 @@ inline void parallel_for(
   Kokkos::Tools::Impl::end_parallel_for(inner_policy, functor, str, kpID);
 }
 
+template <class ExecPolicy, class FunctorType>
+inline void parallel_for(
+    const ExecPolicy& policy, const FunctorType& functor,
+    std::enable_if_t<is_execution_policy<ExecPolicy>::value>* = nullptr) {
+  Kokkos::parallel_for("", policy, functor);
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class ExecPolicy, class FunctorType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload taking the label as first argument instead!")
+inline void parallel_for(
+    const ExecPolicy& policy, const FunctorType& functor,
+    const std::string& str,
+    std::enable_if_t<is_execution_policy<ExecPolicy>::value>* = nullptr) {
+  Kokkos::parallel_for(str, policy, functor);
+}
+#endif
+
 template <class FunctorType>
-inline void parallel_for(const size_t work_count, const FunctorType& functor,
-                         const std::string& str = "") {
+inline void parallel_for(const std::string& str, const size_t work_count,
+                         const FunctorType& functor) {
   using execution_space =
       typename Impl::FunctorPolicyExecutionSpace<FunctorType,
                                                  void>::execution_space;
   using policy = RangePolicy<execution_space>;
 
-  uint64_t kpID = 0;
-
   policy execution_policy = policy(0, work_count);
+  ::Kokkos::parallel_for(str, execution_policy, functor);
+}
 
-  Kokkos::Tools::Impl::begin_parallel_for(execution_policy, functor, str, kpID);
-
-  Kokkos::Impl::shared_allocation_tracking_disable();
-  Impl::ParallelFor<FunctorType, policy> closure(functor, execution_policy);
-  Kokkos::Impl::shared_allocation_tracking_enable();
-
-  closure.execute();
-
-  Kokkos::Tools::Impl::end_parallel_for(execution_policy, functor, str, kpID);
+template <class FunctorType>
+inline void parallel_for(const size_t work_count, const FunctorType& functor) {
+  ::Kokkos::parallel_for("", work_count, functor);
 }
 
-template <class ExecPolicy, class FunctorType>
-inline void parallel_for(const std::string& str, const ExecPolicy& policy,
-                         const FunctorType& functor) {
-  ::Kokkos::parallel_for(policy, functor, str);
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class FunctorType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload taking the label as first argument instead!")
+inline void parallel_for(const size_t work_count, const FunctorType& functor,
+                         const std::string& str) {
+  ::Kokkos::parallel_for(str, work_count, functor);
 }
+#endif
 
 }  // namespace Kokkos
 
@@ -245,8 +269,8 @@ namespace Kokkos {
 ///                     value_type& update,
 ///                     const bool final_pass) const;
 ///   void init (value_type& update) const;
-///   void join (volatile value_type& update,
-//               volatile const value_type& input) const
+///   void join (value_type& update,
+//               const value_type& input) const
 /// };
 /// \endcode
 ///
@@ -276,7 +300,7 @@ namespace Kokkos {
 ///   void init (value_type& update) const {
 ///     update = 0;
 ///   }
-///   void join (volatile value_type& update, volatile const value_type& input)
+///   void join (value_type& update, const value_type& input)
 ///   const {
 ///     update += input;
 ///   }
@@ -314,7 +338,7 @@ namespace Kokkos {
 ///   void init (value_type& update) const {
 ///     update = 0;
 ///   }
-///   void join (volatile value_type& update, volatile const value_type& input)
+///   void join (value_type& update, const value_type& input)
 ///   const {
 ///     update += input;
 ///   }
@@ -361,7 +385,7 @@ namespace Kokkos {
 ///   void init (value_type& update) const {
 ///     update = 0;
 ///   }
-///   void join (volatile value_type& update, volatile const value_type& input)
+///   void join (value_type& update, const value_type& input)
 ///   const {
 ///     update += input;
 ///   }
@@ -373,12 +397,11 @@ namespace Kokkos {
 /// };
 /// \endcode
 ///
-template <class ExecutionPolicy, class FunctorType>
-inline void parallel_scan(
-    const ExecutionPolicy& policy, const FunctorType& functor,
-    const std::string& str = "",
-    typename std::enable_if<
-        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
+template <class ExecutionPolicy, class FunctorType,
+          class Enable =
+              std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>>
+inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
+                          const FunctorType& functor) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -393,40 +416,59 @@ inline void parallel_scan(
   Kokkos::Tools::Impl::end_parallel_scan(inner_policy, functor, str, kpID);
 }
 
+template <class ExecutionPolicy, class FunctorType>
+inline void parallel_scan(
+    const ExecutionPolicy& policy, const FunctorType& functor,
+    std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>* = nullptr) {
+  ::Kokkos::parallel_scan("", policy, functor);
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class ExecutionPolicy, class FunctorType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload taking the label as first argument instead!")
+inline void parallel_scan(
+    const ExecutionPolicy& policy, const FunctorType& functor,
+    const std::string& str,
+    std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>* = nullptr) {
+  ::Kokkos::parallel_scan(str, policy, functor);
+}
+#endif
+
 template <class FunctorType>
-inline void parallel_scan(const size_t work_count, const FunctorType& functor,
-                          const std::string& str = "") {
+inline void parallel_scan(const std::string& str, const size_t work_count,
+                          const FunctorType& functor) {
   using execution_space =
       typename Kokkos::Impl::FunctorPolicyExecutionSpace<FunctorType,
                                                          void>::execution_space;
 
   using policy = Kokkos::RangePolicy<execution_space>;
 
-  uint64_t kpID = 0;
   policy execution_policy(0, work_count);
-  Kokkos::Tools::Impl::begin_parallel_scan(execution_policy, functor, str,
-                                           kpID);
-  Kokkos::Impl::shared_allocation_tracking_disable();
-  Impl::ParallelScan<FunctorType, policy> closure(functor, execution_policy);
-  Kokkos::Impl::shared_allocation_tracking_enable();
-
-  closure.execute();
+  parallel_scan(str, execution_policy, functor);
+}
 
-  Kokkos::Tools::Impl::end_parallel_scan(execution_policy, functor, str, kpID);
+template <class FunctorType>
+inline void parallel_scan(const size_t work_count, const FunctorType& functor) {
+  ::Kokkos::parallel_scan("", work_count, functor);
 }
 
-template <class ExecutionPolicy, class FunctorType>
-inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
-                          const FunctorType& functor) {
-  ::Kokkos::parallel_scan(policy, functor, str);
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class FunctorType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload taking the label as first argument instead!")
+inline void parallel_scan(const size_t work_count, const FunctorType& functor,
+                          const std::string& str) {
+  ::Kokkos::parallel_scan(str, work_count, functor);
 }
+#endif
 
-template <class ExecutionPolicy, class FunctorType, class ReturnType>
-inline void parallel_scan(
-    const ExecutionPolicy& policy, const FunctorType& functor,
-    ReturnType& return_value, const std::string& str = "",
-    typename std::enable_if<
-        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
+template <class ExecutionPolicy, class FunctorType, class ReturnType,
+          class Enable =
+              std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>>
+inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
+                          const FunctorType& functor,
+                          ReturnType& return_value) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -444,10 +486,30 @@ inline void parallel_scan(
       "Kokkos::parallel_scan: fence due to result being a value, not a view");
 }
 
+template <class ExecutionPolicy, class FunctorType, class ReturnType>
+inline void parallel_scan(
+    const ExecutionPolicy& policy, const FunctorType& functor,
+    ReturnType& return_value,
+    std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>* = nullptr) {
+  ::Kokkos::parallel_scan("", policy, functor, return_value);
+}
+
+#ifdef KOKKOS_ENABLE_DISABLE_DEPRECATED_CODE_3
+template <class ExecutionPolicy, class FunctorType, class ReturnType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload taking the label as first argument instead!")
+inline void parallel_scan(
+    const ExecutionPolicy& policy, const FunctorType& functor,
+    ReturnType& return_value, const std::string& str,
+    std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>* = nullptr) {
+  ::Kokkos::parallel_scan(str, policy, functor, return_value);
+}
+#endif
+
 template <class FunctorType, class ReturnType>
-inline void parallel_scan(const size_t work_count, const FunctorType& functor,
-                          ReturnType& return_value,
-                          const std::string& str = "") {
+inline void parallel_scan(const std::string& str, const size_t work_count,
+                          const FunctorType& functor,
+                          ReturnType& return_value) {
   using execution_space =
       typename Kokkos::Impl::FunctorPolicyExecutionSpace<FunctorType,
                                                          void>::execution_space;
@@ -455,29 +517,24 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor,
   using policy = Kokkos::RangePolicy<execution_space>;
 
   policy execution_policy(0, work_count);
-  uint64_t kpID = 0;
-  Kokkos::Tools::Impl::begin_parallel_scan(execution_policy, functor, str,
-                                           kpID);
-
-  Kokkos::Impl::shared_allocation_tracking_disable();
-  Impl::ParallelScanWithTotal<FunctorType, policy, ReturnType> closure(
-      functor, execution_policy, return_value);
-  Kokkos::Impl::shared_allocation_tracking_enable();
-
-  closure.execute();
-
-  Kokkos::Tools::Impl::end_parallel_scan(execution_policy, functor, str, kpID);
-
-  execution_space().fence(
-      "Kokkos::parallel_scan: fence after scan with return value");
+  parallel_scan(str, execution_policy, functor, return_value);
 }
 
-template <class ExecutionPolicy, class FunctorType, class ReturnType>
-inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
-                          const FunctorType& functor,
+template <class FunctorType, class ReturnType>
+inline void parallel_scan(const size_t work_count, const FunctorType& functor,
                           ReturnType& return_value) {
-  ::Kokkos::parallel_scan(policy, functor, return_value, str);
+  ::Kokkos::parallel_scan("", work_count, functor, return_value);
+}
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+template <class FunctorType, class ReturnType>
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use the overload taking the label as first argument instead!")
+inline void parallel_scan(const size_t work_count, const FunctorType& functor,
+                          ReturnType& return_value, const std::string& str) {
+  ::Kokkos::parallel_scan(str, work_count, functor, return_value);
 }
+#endif
 
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index abd5c39bb..9213383ac 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -42,38 +42,39 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_PARALLEL_REDUCE_HPP
 #define KOKKOS_PARALLEL_REDUCE_HPP
 
 #include <Kokkos_NumericTraits.hpp>
 #include <Kokkos_View.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Tools_Generic.hpp>
 #include <type_traits>
 #include <iostream>
 
 namespace Kokkos {
 
-template <class T, class Enable = void>
-struct is_reducer_type {
-  enum { value = 0 };
-};
-
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 template <class T>
-struct is_reducer_type<
-    T, typename std::enable_if<std::is_same<
-           typename std::remove_cv<T>::type,
-           typename std::remove_cv<typename T::reducer>::type>::value>::type> {
-  enum { value = 1 };
-};
+using is_reducer_type KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Use Kokkos::is_reducer instead!") = Kokkos::is_reducer<T>;
+#endif
 
 template <class Scalar, class Space>
 struct Sum {
  public:
   // Required
   using reducer    = Sum<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -93,11 +94,6 @@ struct Sum {
   KOKKOS_INLINE_FUNCTION
   void join(value_type& dest, const value_type& src) const { dest += src; }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest += src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val = reduction_identity<value_type>::sum();
@@ -118,7 +114,7 @@ struct Prod {
  public:
   // Required
   using reducer    = Prod<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -138,11 +134,6 @@ struct Prod {
   KOKKOS_INLINE_FUNCTION
   void join(value_type& dest, const value_type& src) const { dest *= src; }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest *= src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val = reduction_identity<value_type>::prod();
@@ -163,7 +154,7 @@ struct Min {
  public:
   // Required
   using reducer    = Min<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -185,11 +176,6 @@ struct Min {
     if (src < dest) dest = src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src < dest) dest = src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val = reduction_identity<value_type>::min();
@@ -210,7 +196,7 @@ struct Max {
  public:
   // Required
   using reducer    = Max<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -232,11 +218,6 @@ struct Max {
     if (src > dest) dest = src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src > dest) dest = src;
-  }
-
   // Required
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
@@ -258,7 +239,7 @@ struct LAnd {
  public:
   // Required
   using reducer    = LAnd<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -279,11 +260,6 @@ struct LAnd {
     dest = dest && src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest = dest && src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val = reduction_identity<value_type>::land();
@@ -304,7 +280,7 @@ struct LOr {
  public:
   // Required
   using reducer    = LOr<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -326,11 +302,6 @@ struct LOr {
     dest = dest || src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest = dest || src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val = reduction_identity<value_type>::lor();
@@ -351,7 +322,7 @@ struct BAnd {
  public:
   // Required
   using reducer    = BAnd<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -373,11 +344,6 @@ struct BAnd {
     dest = dest & src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest = dest & src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val = reduction_identity<value_type>::band();
@@ -398,7 +364,7 @@ struct BOr {
  public:
   // Required
   using reducer    = BOr<Scalar, Space>;
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -420,11 +386,6 @@ struct BOr {
     dest = dest | src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest = dest | src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val = reduction_identity<value_type>::bor();
@@ -450,19 +411,13 @@ struct ValLocScalar {
     val = rhs.val;
     loc = rhs.loc;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile ValLocScalar& rhs) volatile {
-    val = rhs.val;
-    loc = rhs.loc;
-  }
 };
 
 template <class Scalar, class Index, class Space>
 struct MinLoc {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -489,11 +444,6 @@ struct MinLoc {
     if (src.val < dest.val) dest = src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src.val < dest.val) dest = src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.val = reduction_identity<scalar_type>::min();
@@ -513,8 +463,8 @@ struct MinLoc {
 template <class Scalar, class Index, class Space>
 struct MaxLoc {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -541,11 +491,6 @@ struct MaxLoc {
     if (src.val > dest.val) dest = src;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src.val > dest.val) dest = src;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.val = reduction_identity<scalar_type>::max();
@@ -571,18 +516,12 @@ struct MinMaxScalar {
     min_val = rhs.min_val;
     max_val = rhs.max_val;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile MinMaxScalar& rhs) volatile {
-    min_val = rhs.min_val;
-    max_val = rhs.max_val;
-  }
 };
 
 template <class Scalar, class Space>
 struct MinMax {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
 
  public:
   // Required
@@ -614,16 +553,6 @@ struct MinMax {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.max_val = reduction_identity<scalar_type>::max();
@@ -652,21 +581,13 @@ struct MinMaxLocScalar {
     max_val = rhs.max_val;
     max_loc = rhs.max_loc;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile MinMaxLocScalar& rhs) volatile {
-    min_val = rhs.min_val;
-    min_loc = rhs.min_loc;
-    max_val = rhs.max_val;
-    max_loc = rhs.max_loc;
-  }
 };
 
 template <class Scalar, class Index, class Space>
 struct MinMaxLoc {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -700,18 +621,6 @@ struct MinMaxLoc {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.max_val = reduction_identity<scalar_type>::max();
@@ -740,8 +649,8 @@ struct MinMaxLoc {
 template <class Scalar, class Index, class Space>
 struct MaxFirstLoc {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -772,15 +681,6 @@ struct MaxFirstLoc {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (dest.val < src.val) {
-      dest = src;
-    } else if (!(src.val < dest.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.val = reduction_identity<scalar_type>::max();
@@ -804,8 +704,8 @@ struct MaxFirstLoc {
 template <class Scalar, class Index, class ComparatorType, class Space>
 struct MaxFirstLocCustomComparator {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -840,15 +740,6 @@ struct MaxFirstLocCustomComparator {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (m_comp(dest.val, src.val)) {
-      dest = src;
-    } else if (!m_comp(src.val, dest.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.val = reduction_identity<scalar_type>::max();
@@ -871,8 +762,8 @@ struct MaxFirstLocCustomComparator {
 template <class Scalar, class Index, class Space>
 struct MinFirstLoc {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -903,15 +794,6 @@ struct MinFirstLoc {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src.val < dest.val) {
-      dest = src;
-    } else if (!(dest.val < src.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.val = reduction_identity<scalar_type>::min();
@@ -935,8 +817,8 @@ struct MinFirstLoc {
 template <class Scalar, class Index, class ComparatorType, class Space>
 struct MinFirstLocCustomComparator {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -971,15 +853,6 @@ struct MinFirstLocCustomComparator {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (m_comp(src.val, dest.val)) {
-      dest = src;
-    } else if (!m_comp(dest.val, src.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.val = reduction_identity<scalar_type>::min();
@@ -1002,8 +875,8 @@ struct MinFirstLocCustomComparator {
 template <class Scalar, class Index, class Space>
 struct MinMaxFirstLastLoc {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -1043,23 +916,6 @@ struct MinMaxFirstLastLoc {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    } else if (!(dest.min_val < src.min_val)) {
-      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
-    }
-
-    if (dest.max_val < src.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    } else if (!(src.max_val < dest.max_val)) {
-      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.max_val = ::Kokkos::reduction_identity<scalar_type>::max();
@@ -1085,8 +941,8 @@ struct MinMaxFirstLastLoc {
 template <class Scalar, class Index, class ComparatorType, class Space>
 struct MinMaxFirstLastLocCustomComparator {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -1129,23 +985,6 @@ struct MinMaxFirstLastLocCustomComparator {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    if (m_comp(src.min_val, dest.min_val)) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    } else if (!m_comp(dest.min_val, src.min_val)) {
-      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
-    }
-
-    if (m_comp(dest.max_val, src.max_val)) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    } else if (!m_comp(src.max_val, dest.max_val)) {
-      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.max_val = ::Kokkos::reduction_identity<scalar_type>::max();
@@ -1173,17 +1012,12 @@ struct FirstLocScalar {
 
   KOKKOS_INLINE_FUNCTION
   void operator=(const FirstLocScalar& rhs) { min_loc_true = rhs.min_loc_true; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile FirstLocScalar& rhs) volatile {
-    min_loc_true = rhs.min_loc_true;
-  }
 };
 
 template <class Index, class Space>
 struct FirstLoc {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -1212,13 +1046,6 @@ struct FirstLoc {
                             : dest.min_loc_true;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
-                            ? src.min_loc_true
-                            : dest.min_loc_true;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.min_loc_true = ::Kokkos::reduction_identity<index_type>::min();
@@ -1243,17 +1070,12 @@ struct LastLocScalar {
 
   KOKKOS_INLINE_FUNCTION
   void operator=(const LastLocScalar& rhs) { max_loc_true = rhs.max_loc_true; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile LastLocScalar& rhs) volatile {
-    max_loc_true = rhs.max_loc_true;
-  }
 };
 
 template <class Index, class Space>
 struct LastLoc {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -1282,13 +1104,6 @@ struct LastLoc {
                             : dest.max_loc_true;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.max_loc_true = ::Kokkos::reduction_identity<index_type>::max();
@@ -1313,12 +1128,6 @@ struct StdIsPartScalar {
     min_loc_false = rhs.min_loc_false;
     max_loc_true  = rhs.max_loc_true;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile StdIsPartScalar& rhs) volatile {
-    min_loc_false = rhs.min_loc_false;
-    max_loc_true  = rhs.max_loc_true;
-  }
 };
 
 //
@@ -1327,7 +1136,7 @@ struct StdIsPartScalar {
 template <class Index, class Space>
 struct StdIsPartitioned {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -1361,17 +1170,6 @@ struct StdIsPartitioned {
                              : src.min_loc_false;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.max_loc_true  = ::Kokkos::reduction_identity<index_type>::max();
@@ -1396,11 +1194,6 @@ struct StdPartPointScalar {
   void operator=(const StdPartPointScalar& rhs) {
     min_loc_false = rhs.min_loc_false;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile StdPartPointScalar& rhs) volatile {
-    min_loc_false = rhs.min_loc_false;
-  }
 };
 
 //
@@ -1409,7 +1202,7 @@ struct StdPartPointScalar {
 template <class Index, class Space>
 struct StdPartitionPoint {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -1439,13 +1232,6 @@ struct StdPartitionPoint {
                              : src.min_loc_false;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const {
     val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
@@ -1470,8 +1256,8 @@ struct ParallelReduceReturnValue;
 
 template <class ReturnType, class FunctorType>
 struct ParallelReduceReturnValue<
-    typename std::enable_if<Kokkos::is_view<ReturnType>::value>::type,
-    ReturnType, FunctorType> {
+    std::enable_if_t<Kokkos::is_view<ReturnType>::value>, ReturnType,
+    FunctorType> {
   using return_type  = ReturnType;
   using reducer_type = InvalidType;
 
@@ -1488,10 +1274,10 @@ struct ParallelReduceReturnValue<
 
 template <class ReturnType, class FunctorType>
 struct ParallelReduceReturnValue<
-    typename std::enable_if<!Kokkos::is_view<ReturnType>::value &&
-                            (!std::is_array<ReturnType>::value &&
-                             !std::is_pointer<ReturnType>::value) &&
-                            !Kokkos::is_reducer_type<ReturnType>::value>::type,
+    std::enable_if_t<!Kokkos::is_view<ReturnType>::value &&
+                     (!std::is_array<ReturnType>::value &&
+                      !std::is_pointer<ReturnType>::value) &&
+                     !Kokkos::is_reducer<ReturnType>::value>,
     ReturnType, FunctorType> {
   using return_type =
       Kokkos::View<ReturnType, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
@@ -1507,10 +1293,10 @@ struct ParallelReduceReturnValue<
 
 template <class ReturnType, class FunctorType>
 struct ParallelReduceReturnValue<
-    typename std::enable_if<(std::is_array<ReturnType>::value ||
-                             std::is_pointer<ReturnType>::value)>::type,
+    std::enable_if_t<(std::is_array<ReturnType>::value ||
+                      std::is_pointer<ReturnType>::value)>,
     ReturnType, FunctorType> {
-  using return_type = Kokkos::View<typename std::remove_const<ReturnType>::type,
+  using return_type = Kokkos::View<std::remove_const_t<ReturnType>,
                                    Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
 
   using reducer_type = InvalidType;
@@ -1528,8 +1314,8 @@ struct ParallelReduceReturnValue<
 
 template <class ReturnType, class FunctorType>
 struct ParallelReduceReturnValue<
-    typename std::enable_if<Kokkos::is_reducer_type<ReturnType>::value>::type,
-    ReturnType, FunctorType> {
+    std::enable_if_t<Kokkos::is_reducer<ReturnType>::value>, ReturnType,
+    FunctorType> {
   using return_type  = ReturnType;
   using reducer_type = ReturnType;
   using value_type   = typename return_type::value_type;
@@ -1544,8 +1330,7 @@ struct ParallelReducePolicyType;
 
 template <class PolicyType, class FunctorType>
 struct ParallelReducePolicyType<
-    typename std::enable_if<
-        Kokkos::is_execution_policy<PolicyType>::value>::type,
+    std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value>,
     PolicyType, FunctorType> {
   using policy_type = PolicyType;
   static PolicyType policy(const PolicyType& policy_) { return policy_; }
@@ -1553,8 +1338,8 @@ struct ParallelReducePolicyType<
 
 template <class PolicyType, class FunctorType>
 struct ParallelReducePolicyType<
-    typename std::enable_if<std::is_integral<PolicyType>::value>::type,
-    PolicyType, FunctorType> {
+    std::enable_if_t<std::is_integral<PolicyType>::value>, PolicyType,
+    FunctorType> {
   using execution_space =
       typename Impl::FunctorPolicyExecutionSpace<FunctorType,
                                                  void>::execution_space;
@@ -1619,7 +1404,7 @@ struct ParallelReduceAdaptor {
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   template <typename Dummy = ReturnType>
   KOKKOS_DEPRECATED_WITH_COMMENT(
-      "Array reductions with a raw pointer return type a deprecated. Use a "
+      "Array reductions with a raw pointer return type are deprecated. Use a "
       "Kokkos::View as return argument!")
   static inline std::
       enable_if_t<is_array_reduction && std::is_pointer<Dummy>::value> execute(
@@ -1720,8 +1505,8 @@ struct ParallelReduceFence {
  *    using value_type = <podType>;
  *    void operator()( <intType> iwork , <podType> & update ) const ;
  *    void init( <podType> & update ) const ;
- *    void join( volatile       <podType> & update ,
- *               volatile const <podType> & input ) const ;
+ *    void join(       <podType> & update ,
+ *               const <podType> & input ) const ;
  *
  *    void final( <podType> & update ) const ;
  *  };
@@ -1736,8 +1521,8 @@ struct ParallelReduceFence {
  *    using value_type = <podType>[];
  *    void operator()( <intType> , <podType> update[] ) const ;
  *    void init( <podType> update[] ) const ;
- *    void join( volatile       <podType> update[] ,
- *               volatile const <podType> input[] ) const ;
+ *    void join(       <podType> update[] ,
+ *               const <podType> input[] ) const ;
  *
  *    void final( <podType> update[] ) const ;
  *  };
@@ -1915,16 +1700,17 @@ template <class PolicyType, class FunctorType>
 inline void parallel_reduce(
     const std::string& label, const PolicyType& policy,
     const FunctorType& functor,
-    typename std::enable_if<
-        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
-                                        typename ValueTraits::value_type,
-                                        typename ValueTraits::pointer_type>;
+    std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value>* =
+        nullptr) {
+  using FunctorAnalysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                            FunctorType>;
+  using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
+                                        typename FunctorAnalysis::value_type,
+                                        typename FunctorAnalysis::pointer_type>;
 
   static_assert(
-      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                            FunctorType>::has_final_member_function,
+      FunctorAnalysis::has_final_member_function,
       "Calling parallel_reduce without either return value or final function.");
 
   using result_view_type =
@@ -1939,16 +1725,17 @@ inline void parallel_reduce(
 template <class PolicyType, class FunctorType>
 inline void parallel_reduce(
     const PolicyType& policy, const FunctorType& functor,
-    typename std::enable_if<
-        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
-                                        typename ValueTraits::value_type,
-                                        typename ValueTraits::pointer_type>;
+    std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value>* =
+        nullptr) {
+  using FunctorAnalysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                            FunctorType>;
+  using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
+                                        typename FunctorAnalysis::value_type,
+                                        typename FunctorAnalysis::pointer_type>;
 
   static_assert(
-      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
-                            FunctorType>::has_final_member_function,
+      FunctorAnalysis::has_final_member_function,
       "Calling parallel_reduce without either return value or final function.");
 
   using result_view_type =
@@ -1965,15 +1752,15 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor) {
   using policy_type =
       typename Impl::ParallelReducePolicyType<void, size_t,
                                               FunctorType>::policy_type;
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
-                                        typename ValueTraits::value_type,
-                                        typename ValueTraits::pointer_type>;
+  using FunctorAnalysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, policy_type,
+                            FunctorType>;
+  using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
+                                        typename FunctorAnalysis::value_type,
+                                        typename FunctorAnalysis::pointer_type>;
 
   static_assert(
-      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                            RangePolicy<>,
-                            FunctorType>::has_final_member_function,
+      FunctorAnalysis::has_final_member_function,
       "Calling parallel_reduce without either return value or final function.");
 
   using result_view_type =
@@ -1992,15 +1779,15 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
   using policy_type =
       typename Impl::ParallelReducePolicyType<void, size_t,
                                               FunctorType>::policy_type;
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
-                                        typename ValueTraits::value_type,
-                                        typename ValueTraits::pointer_type>;
+  using FunctorAnalysis =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, policy_type,
+                            FunctorType>;
+  using value_type = std::conditional_t<(FunctorAnalysis::StaticValueSize != 0),
+                                        typename FunctorAnalysis::value_type,
+                                        typename FunctorAnalysis::pointer_type>;
 
   static_assert(
-      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                            RangePolicy<>,
-                            FunctorType>::has_final_member_function,
+      FunctorAnalysis::has_final_member_function,
       "Calling parallel_reduce without either return value or final function.");
 
   using result_view_type =
diff --git a/packages/kokkos/core/src/Kokkos_PointerOwnership.hpp b/packages/kokkos/core/src/Kokkos_PointerOwnership.hpp
index f1f168c38..41b18a8d1 100644
--- a/packages/kokkos/core/src/Kokkos_PointerOwnership.hpp
+++ b/packages/kokkos/core/src/Kokkos_PointerOwnership.hpp
@@ -44,6 +44,15 @@
 
 // Experimental unified task-data parallel manycore LDRD
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_IMPL_POINTEROWNERSHIP_HPP
 #define KOKKOS_IMPL_POINTEROWNERSHIP_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp
index 4556cddba..266605c0f 100644
--- a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp
+++ b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOSP_PROFILE_SECTION_HPP
 #define KOKKOSP_PROFILE_SECTION_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_PROFILING_PROFILESECTION
+#endif
 
 #include <Kokkos_Macros.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
@@ -103,4 +107,8 @@ class ProfilingSection {
 }  // namespace Profiling
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING_PROFILESECTION
+#endif
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_Rank.hpp b/packages/kokkos/core/src/Kokkos_Rank.hpp
index 3603e2860..025cf511f 100644
--- a/packages/kokkos/core/src/Kokkos_Rank.hpp
+++ b/packages/kokkos/core/src/Kokkos_Rank.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_KOKKOS_RANK_HPP
 #define KOKKOS_KOKKOS_RANK_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp
index e29093db3..a7f169606 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_SYCL_HPP
 #define KOKKOS_SYCL_HPP
 
@@ -52,9 +61,9 @@
 #include <Kokkos_SYCL_Space.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 
 namespace Kokkos {
 namespace Experimental {
@@ -87,9 +96,9 @@ class SYCL {
     return m_space_instance->impl_get_instance_id();
   }
 
-  sycl::context sycl_context() const noexcept {
-    return m_space_instance->m_queue->get_context();
-  };
+  sycl::queue& sycl_queue() const noexcept {
+    return *m_space_instance->m_queue;
+  }
 
   //@}
   //------------------------------------
@@ -111,38 +120,19 @@ class SYCL {
   static bool wake();
 
   /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
-  static void impl_static_fence();
-  static void impl_static_fence(const std::string&);
-  void fence() const;
-  void fence(const std::string&) const;
+  static void impl_static_fence(const std::string& name);
+
+  void fence(
+      const std::string& name =
+          "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence") const;
 
   /// \brief Print configuration information to the given output stream.
-  void print_configuration(std::ostream&, const bool detail = false);
+  void print_configuration(std::ostream& os, bool verbose = false) const;
 
   /// \brief Free any resources being consumed by the device.
   static void impl_finalize();
 
-  /** \brief  Initialize the device.
-   *
-   */
-
-  struct SYCLDevice {
-    SYCLDevice() : SYCLDevice(sycl::default_selector()) {}
-    explicit SYCLDevice(sycl::device d);
-    explicit SYCLDevice(const sycl::device_selector& selector);
-    explicit SYCLDevice(size_t id);
-
-    sycl::device get_device() const;
-
-    friend std::ostream& operator<<(std::ostream& os, const SYCLDevice& that) {
-      return SYCL::impl_sycl_info(os, that.m_device);
-    }
-
-   private:
-    sycl::device m_device;
-  };
-
-  static void impl_initialize(SYCLDevice = SYCLDevice());
+  static void impl_initialize(InitializationSettings const&);
 
   int sycl_device() const;
 
@@ -162,18 +152,6 @@ class SYCL {
   Kokkos::Impl::HostSharedPtr<Impl::SYCLInternal> m_space_instance;
 };
 
-namespace Impl {
-
-class SYCLSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
- public:
-  void initialize(const InitArguments& args) final;
-  void finalize(const bool) final;
-  void fence() final;
-  void fence(const std::string&) final;
-  void print_configuration(std::ostream& msg, const bool detail) final;
-};
-
-}  // namespace Impl
 }  // namespace Experimental
 
 namespace Tools {
@@ -198,12 +176,13 @@ std::vector<SYCL> partition_space(const SYCL& sycl_space, Args...) {
       "Kokkos Error: partitioning arguments must be integers or floats");
 #endif
 
-  sycl::context context = sycl_space.sycl_context();
-  sycl::default_selector device_selector;
+  sycl::context context = sycl_space.sycl_queue().get_context();
+  sycl::device device =
+      sycl_space.impl_internal_space_instance()->m_queue->get_device();
   std::vector<SYCL> instances;
   instances.reserve(sizeof...(Args));
   for (unsigned int i = 0; i < sizeof...(Args); ++i)
-    instances.emplace_back(sycl::queue(context, device_selector));
+    instances.emplace_back(sycl::queue(context, device));
   return instances;
 }
 
@@ -214,12 +193,13 @@ std::vector<SYCL> partition_space(const SYCL& sycl_space,
       std::is_arithmetic<T>::value,
       "Kokkos Error: partitioning arguments must be integers or floats");
 
-  sycl::context context = sycl_space.sycl_context();
-  sycl::default_selector device_selector;
+  sycl::context context = sycl_space.sycl_queue().get_context();
+  sycl::device device =
+      sycl_space.impl_internal_space_instance()->m_queue->get_device();
   std::vector<SYCL> instances;
   instances.reserve(weights.size());
   for (unsigned int i = 0; i < weights.size(); ++i)
-    instances.emplace_back(sycl::queue(context, device_selector));
+    instances.emplace_back(sycl::queue(context, device));
   return instances;
 }
 }  // namespace Experimental
diff --git a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
index 15ef11024..e147d04dc 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_SYCLSPACE_HPP
 #define KOKKOS_SYCLSPACE_HPP
 
@@ -74,6 +83,11 @@ class SYCLDeviceUSMSpace {
   SYCLDeviceUSMSpace();
   explicit SYCLDeviceUSMSpace(sycl::queue queue);
 
+  void* allocate(const SYCL& exec_space,
+                 const std::size_t arg_alloc_size) const;
+  void* allocate(const SYCL& exec_space, const char* arg_label,
+                 const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
   void* allocate(const std::size_t arg_alloc_size) const;
   void* allocate(const char* arg_label, const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;
@@ -105,6 +119,11 @@ class SYCLSharedUSMSpace {
   SYCLSharedUSMSpace();
   explicit SYCLSharedUSMSpace(sycl::queue queue);
 
+  void* allocate(const SYCL& exec_space,
+                 const std::size_t arg_alloc_size) const;
+  void* allocate(const SYCL& exec_space, const char* arg_label,
+                 const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
   void* allocate(const std::size_t arg_alloc_size) const;
   void* allocate(const char* arg_label, const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;
@@ -136,6 +155,11 @@ class SYCLHostUSMSpace {
   SYCLHostUSMSpace();
   explicit SYCLHostUSMSpace(sycl::queue queue);
 
+  void* allocate(const SYCL& exec_space,
+                 const std::size_t arg_alloc_size) const;
+  void* allocate(const SYCL& exec_space, const char* arg_label,
+                 const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
   void* allocate(const std::size_t arg_alloc_size) const;
   void* allocate(const char* arg_label, const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;
@@ -347,6 +371,21 @@ class SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>
  protected:
   ~SharedAllocationRecord();
 
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::SYCL& exec_space,
+      const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+
   SharedAllocationRecord(
       const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
@@ -378,6 +417,21 @@ class SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>
 
   SharedAllocationRecord() = default;
 
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::SYCL& exec_space,
+      const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+
   SharedAllocationRecord(
       const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
@@ -409,6 +463,21 @@ class SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>
 
   SharedAllocationRecord() = default;
 
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::SYCL& exec_space,
+      const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+
   SharedAllocationRecord(
       const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
       const std::string& arg_label, const size_t arg_alloc_size,
diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
index bb740cfb8..3e37eb61d 100644
--- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_SCRATCHSPACE_HPP
 #define KOKKOS_SCRATCHSPACE_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Kokkos_Serial.hpp
index b2e524c37..ffdd1e9fc 100644
--- a/packages/kokkos/core/src/Kokkos_Serial.hpp
+++ b/packages/kokkos/core/src/Kokkos_Serial.hpp
@@ -45,6 +45,15 @@
 /// \file Kokkos_Serial.hpp
 /// \brief Declaration and definition of Kokkos::Serial device.
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_SERIAL_HPP
 #define KOKKOS_SERIAL_HPP
 
@@ -56,7 +65,6 @@
 #include <mutex>
 #include <thread>
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_HostSpace.hpp>
@@ -64,14 +72,9 @@
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Tools.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-
-#include <Kokkos_UniqueToken.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 
 namespace Kokkos {
 
@@ -151,10 +154,6 @@ class Serial {
   /// return asynchronously, before the functor completes.  This
   /// method does not return until all dispatched functors on this
   /// device have completed.
-  static void impl_static_fence() {
-    impl_static_fence(
-        "Kokkos::Serial::impl_static_fence: Unnamed Static Fence");
-  }
   static void impl_static_fence(const std::string& name) {
     Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
         name,
@@ -164,8 +163,8 @@ class Serial {
     Kokkos::memory_fence();
   }
 
-  void fence() const { fence("Kokkos::Serial::fence: Unnamed Instance Fence"); }
-  void fence(const std::string& name) const {
+  void fence(const std::string& name =
+                 "Kokkos::Serial::fence: Unnamed Instance Fence") const {
     Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
         name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1},
         []() {});  // TODO: correct device ID
@@ -176,10 +175,9 @@ class Serial {
   static int concurrency() { return 1; }
 
   //! Print configuration information to the given output stream.
-  static void print_configuration(std::ostream&,
-                                  const bool /* detail */ = false) {}
+  void print_configuration(std::ostream& os, bool verbose = false) const;
 
-  static void impl_initialize();
+  static void impl_initialize(InitializationSettings const&);
 
   static bool impl_is_initialized();
 
@@ -230,21 +228,6 @@ struct DeviceTypeTraits<Serial> {
 };
 }  // namespace Experimental
 }  // namespace Tools
-
-namespace Impl {
-
-class SerialSpaceInitializer : public ExecSpaceInitializerBase {
- public:
-  SerialSpaceInitializer()  = default;
-  ~SerialSpaceInitializer() = default;
-  void initialize(const InitArguments& args) final;
-  void finalize(const bool) final;
-  void fence() final;
-  void fence(const std::string&) final;
-  void print_configuration(std::ostream& msg, const bool detail) final;
-};
-
-}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -253,6 +236,23 @@ class SerialSpaceInitializer : public ExecSpaceInitializerBase {
 namespace Kokkos {
 namespace Impl {
 
+// We only need to provide a specialization for Serial if there is a host
+// parallel execution space since the specialization for
+// DefaultHostExecutionSpace is defined elsewhere.
+struct DummyExecutionSpace;
+template <class DT, class... DP>
+struct ZeroMemset<
+    std::conditional_t<!std::is_same<Serial, DefaultHostExecutionSpace>::value,
+                       Serial, DummyExecutionSpace>,
+    DT, DP...> : public ZeroMemset<DefaultHostExecutionSpace, DT, DP...> {
+  using Base = ZeroMemset<DefaultHostExecutionSpace, DT, DP...>;
+  using Base::Base;
+
+  ZeroMemset(const Serial&, const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type& value)
+      : Base(dst, value) {}
+};
+
 template <>
 struct MemorySpaceAccess<Kokkos::Serial::memory_space,
                          Kokkos::Serial::scratch_memory_space> {
@@ -264,923 +264,11 @@ struct MemorySpaceAccess<Kokkos::Serial::memory_space,
 }  // namespace Impl
 }  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-/*
- * < Kokkos::Serial , WorkArgTag >
- * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial ,
- * Kokkos::DefaultExecutionSpace >::value >::type >
- *
- */
-template <class... Properties>
-class TeamPolicyInternal<Kokkos::Serial, Properties...>
-    : public PolicyTraits<Properties...> {
- private:
-  size_t m_team_scratch_size[2];
-  size_t m_thread_scratch_size[2];
-  int m_league_size;
-  int m_chunk_size;
-
- public:
-  //! Tag this class as a kokkos execution policy
-  using execution_policy = TeamPolicyInternal;
-
-  using traits = PolicyTraits<Properties...>;
-
-  //! Execution space of this execution policy:
-  using execution_space = Kokkos::Serial;
-
-  const typename traits::execution_space& space() const {
-    static typename traits::execution_space m_space;
-    return m_space;
-  }
-
-  template <class ExecSpace, class... OtherProperties>
-  friend class TeamPolicyInternal;
-
-  template <class... OtherProperties>
-  TeamPolicyInternal(
-      const TeamPolicyInternal<Kokkos::Serial, OtherProperties...>& p) {
-    m_league_size            = p.m_league_size;
-    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
-    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
-    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
-    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
-    m_chunk_size             = p.m_chunk_size;
-  }
-
-  //----------------------------------------
-
-  template <class FunctorType>
-  int team_size_max(const FunctorType&, const ParallelForTag&) const {
-    return 1;
-  }
-  template <class FunctorType>
-  int team_size_max(const FunctorType&, const ParallelReduceTag&) const {
-    return 1;
-  }
-  template <class FunctorType, class ReducerType>
-  int team_size_max(const FunctorType&, const ReducerType&,
-                    const ParallelReduceTag&) const {
-    return 1;
-  }
-  template <class FunctorType>
-  int team_size_recommended(const FunctorType&, const ParallelForTag&) const {
-    return 1;
-  }
-  template <class FunctorType>
-  int team_size_recommended(const FunctorType&,
-                            const ParallelReduceTag&) const {
-    return 1;
-  }
-  template <class FunctorType, class ReducerType>
-  int team_size_recommended(const FunctorType&, const ReducerType&,
-                            const ParallelReduceTag&) const {
-    return 1;
-  }
-
-  //----------------------------------------
-
-  inline int team_size() const { return 1; }
-  inline bool impl_auto_team_size() const { return false; }
-  inline bool impl_auto_vector_length() const { return false; }
-  inline void impl_set_team_size(size_t) {}
-  inline void impl_set_vector_length(size_t) {}
-  inline int league_size() const { return m_league_size; }
-  inline size_t scratch_size(const int& level, int = 0) const {
-    return m_team_scratch_size[level] + m_thread_scratch_size[level];
-  }
-
-  inline int impl_vector_length() const { return 1; }
-  inline static int vector_length_max() {
-    return 1024;
-  }  // Use arbitrary large number, is meant as a vectorizable length
-
-  inline static int scratch_size_max(int level) {
-    return (level == 0 ? 1024 * 32 : 20 * 1024 * 1024);
-  }
-  /** \brief  Specify league size, request team size */
-  TeamPolicyInternal(const execution_space&, int league_size_request,
-                     int team_size_request, int /* vector_length_request */ = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_league_size(league_size_request),
-        m_chunk_size(32) {
-    if (team_size_request > 1)
-      Kokkos::abort("Kokkos::abort: Requested Team Size is too large!");
-  }
-
-  TeamPolicyInternal(const execution_space& space, int league_size_request,
-                     const Kokkos::AUTO_t& /**team_size_request*/,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(space, league_size_request, -1,
-                           vector_length_request) {}
-
-  TeamPolicyInternal(const execution_space& space, int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     const Kokkos::AUTO_t& /* vector_length_request */
-                     )
-      : TeamPolicyInternal(space, league_size_request, -1, -1) {}
-
-  TeamPolicyInternal(const execution_space& space, int league_size_request,
-                     int team_size_request,
-                     const Kokkos::AUTO_t& /* vector_length_request */
-                     )
-      : TeamPolicyInternal(space, league_size_request, team_size_request, -1) {}
-
-  TeamPolicyInternal(int league_size_request,
-                     const Kokkos::AUTO_t& team_size_request,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(typename traits::execution_space(),
-                           league_size_request, team_size_request,
-                           vector_length_request) {}
-
-  TeamPolicyInternal(int league_size_request,
-                     const Kokkos::AUTO_t& team_size_request,
-                     const Kokkos::AUTO_t& vector_length_request)
-      : TeamPolicyInternal(typename traits::execution_space(),
-                           league_size_request, team_size_request,
-                           vector_length_request) {}
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     const Kokkos::AUTO_t& vector_length_request)
-      : TeamPolicyInternal(typename traits::execution_space(),
-                           league_size_request, team_size_request,
-                           vector_length_request) {}
-
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     int vector_length_request = 1)
-      : TeamPolicyInternal(typename traits::execution_space(),
-                           league_size_request, team_size_request,
-                           vector_length_request) {}
-
-  inline int chunk_size() const { return m_chunk_size; }
-
-  /** \brief set chunk_size to a discrete value*/
-  inline TeamPolicyInternal& set_chunk_size(
-      typename traits::index_type chunk_size_) {
-    m_chunk_size = chunk_size_;
-    return *this;
-  }
-
-  /** \brief set per team scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(const int& level,
-                                              const PerTeamValue& per_team) {
-    m_team_scratch_size[level] = per_team.value;
-    return *this;
-  }
-
-  /** \brief set per thread scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerThreadValue& per_thread) {
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  /** \brief set per thread and per team scratch size for a specific level of
-   * the scratch hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerTeamValue& per_team,
-      const PerThreadValue& per_thread) {
-    m_team_scratch_size[level]   = per_team.value;
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  using member_type = Impl::HostThreadTeamMember<Kokkos::Serial>;
-};
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-/* Parallel patterns for Kokkos::Serial with RangePolicy */
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Serial> {
- private:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  typename std::enable_if<std::is_same<TagType, void>::value>::type exec()
-      const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i);
-    }
-  }
-
-  template <class TagType>
-  typename std::enable_if<!std::is_same<TagType, void>::value>::type exec()
-      const {
-    const TagType t{};
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i);
-    }
-  }
-
- public:
-  inline void execute() const {
-    this->template exec<typename Policy::work_tag>();
-  }
-
-  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-/*--------------------------------------------------------------------------*/
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::Serial> {
- private:
-  using Policy  = Kokkos::RangePolicy<Traits...>;
-  using WorkTag = typename Policy::work_tag;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
-
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
-      reference_type update) const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update);
-    }
-  }
-
-  template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  exec(reference_type update) const {
-    const TagType t{};
-
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update);
-    }
-  }
-
- public:
-  inline void execute() const {
-    const size_t pool_reduce_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
-    const size_t team_reduce_size  = 0;  // Never shrinks
-    const size_t team_shared_size  = 0;  // Never shrinks
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    auto* internal_instance = m_policy.space().impl_internal_space_instance();
-    // Need to lock resize_thread_team_data
-    std::lock_guard<std::mutex> lock(
-        internal_instance->m_thread_team_data_mutex);
-    internal_instance->resize_thread_team_data(
-        pool_reduce_size, team_reduce_size, team_shared_size,
-        thread_local_size);
-
-    pointer_type ptr =
-        m_result_ptr
-            ? m_result_ptr
-            : pointer_type(
-                  internal_instance->m_thread_team_data.pool_reduce_local());
-
-    reference_type update =
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
-
-    this->template exec<WorkTag>(update);
-
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), ptr);
-  }
-
-  template <class HostViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const HostViewType& arg_result_view,
-      typename std::enable_if<Kokkos::is_view<HostViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
-                  "Kokkos::Serial reduce result must be a View");
-
-    static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
-                                        Kokkos::HostSpace>::accessible,
-        "Kokkos::Serial reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-};
-
-/*--------------------------------------------------------------------------*/
-
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                   Kokkos::Serial> {
- private:
-  using Policy  = Kokkos::RangePolicy<Traits...>;
-  using WorkTag = typename Policy::work_tag;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-
-  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
-      reference_type update) const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update, true);
-    }
-  }
-
-  template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  exec(reference_type update) const {
-    const TagType t{};
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update, true);
-    }
-  }
-
- public:
-  inline void execute() const {
-    const size_t pool_reduce_size  = Analysis::value_size(m_functor);
-    const size_t team_reduce_size  = 0;  // Never shrinks
-    const size_t team_shared_size  = 0;  // Never shrinks
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    // Need to lock resize_thread_team_data
-    auto* internal_instance = m_policy.space().impl_internal_space_instance();
-    std::lock_guard<std::mutex> lock(
-        internal_instance->m_thread_team_data_mutex);
-    internal_instance->resize_thread_team_data(
-        pool_reduce_size, team_reduce_size, team_shared_size,
-        thread_local_size);
-
-    reference_type update = ValueInit::init(
-        m_functor,
-        pointer_type(
-            internal_instance->m_thread_team_data.pool_reduce_local()));
-
-    this->template exec<WorkTag>(update);
-  }
-
-  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-/*--------------------------------------------------------------------------*/
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::Serial> {
- private:
-  using Policy  = Kokkos::RangePolicy<Traits...>;
-  using WorkTag = typename Policy::work_tag;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
-
-  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  ReturnType& m_returnvalue;
-
-  template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
-      reference_type update) const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update, true);
-    }
-  }
-
-  template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  exec(reference_type update) const {
-    const TagType t{};
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update, true);
-    }
-  }
-
- public:
-  inline void execute() {
-    const size_t pool_reduce_size  = Analysis::value_size(m_functor);
-    const size_t team_reduce_size  = 0;  // Never shrinks
-    const size_t team_shared_size  = 0;  // Never shrinks
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    // Need to lock resize_thread_team_data
-    auto* internal_instance = m_policy.space().impl_internal_space_instance();
-    std::lock_guard<std::mutex> lock(
-        internal_instance->m_thread_team_data_mutex);
-    internal_instance->resize_thread_team_data(
-        pool_reduce_size, team_reduce_size, team_shared_size,
-        thread_local_size);
-
-    reference_type update = ValueInit::init(
-        m_functor,
-        pointer_type(
-            internal_instance->m_thread_team_data.pool_reduce_local()));
-
-    this->template exec<WorkTag>(update);
-
-    m_returnvalue = update;
-  }
-
-  inline ParallelScanWithTotal(const FunctorType& arg_functor,
-                               const Policy& arg_policy,
-                               ReturnType& arg_returnvalue)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_returnvalue(arg_returnvalue) {}
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-/* Parallel patterns for Kokkos::Serial with MDRangePolicy */
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
-                  Kokkos::Serial> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-
-  using iterate_type = typename Kokkos::Impl::HostIterateTile<
-      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
-
-  const FunctorType m_functor;
-  const MDRangePolicy m_mdr_policy;
-  const Policy m_policy;
-
-  void exec() const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      iterate_type(m_mdr_policy, m_functor)(i);
-    }
-  }
-
- public:
-  inline void execute() const { this->exec(); }
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy&, const Functor&) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-  inline ParallelFor(const FunctorType& arg_functor,
-                     const MDRangePolicy& arg_policy)
-      : m_functor(arg_functor),
-        m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
-};
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Serial> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-
-  using WorkTag = typename MDRangePolicy::work_tag;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
-                                   MDRangePolicy, FunctorType>;
-
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using value_type     = typename Analysis::value_type;
-  using reference_type = typename Analysis::reference_type;
-
-  using iterate_type =
-      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
-                                             WorkTag, reference_type>;
-
-  const FunctorType m_functor;
-  const MDRangePolicy m_mdr_policy;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  inline void exec(reference_type update) const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      iterate_type(m_mdr_policy, m_functor, update)(i);
-    }
-  }
-
- public:
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy&, const Functor&) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-  inline void execute() const {
-    const size_t pool_reduce_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
-    const size_t team_reduce_size  = 0;  // Never shrinks
-    const size_t team_shared_size  = 0;  // Never shrinks
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    auto* internal_instance = m_policy.space().impl_internal_space_instance();
-    // Need to lock resize_thread_team_data
-    std::lock_guard<std::mutex> lock(
-        internal_instance->m_thread_team_data_mutex);
-    internal_instance->resize_thread_team_data(
-        pool_reduce_size, team_reduce_size, team_shared_size,
-        thread_local_size);
-
-    pointer_type ptr =
-        m_result_ptr
-            ? m_result_ptr
-            : pointer_type(
-                  internal_instance->m_thread_team_data.pool_reduce_local());
-
-    reference_type update =
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
-
-    this->exec(update);
-
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), ptr);
-  }
-
-  template <class HostViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const MDRangePolicy& arg_policy,
-      const HostViewType& arg_result_view,
-      typename std::enable_if<Kokkos::is_view<HostViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
-      : m_functor(arg_functor),
-        m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
-                  "Kokkos::Serial reduce result must be a View");
-
-    static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
-                                        Kokkos::HostSpace>::accessible,
-        "Kokkos::Serial reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor,
-                        MDRangePolicy arg_policy, const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-/* Parallel patterns for Kokkos::Serial with TeamPolicy */
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::Serial> {
- private:
-  enum { TEAM_REDUCE_SIZE = 512 };
-
-  using Policy = TeamPolicyInternal<Kokkos::Serial, Properties...>;
-  using Member = typename Policy::member_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const int m_league;
-  const int m_shared;
-
-  template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
-      HostThreadTeamData& data) const {
-    for (int ileague = 0; ileague < m_league; ++ileague) {
-      m_functor(Member(data, ileague, m_league));
-    }
-  }
-
-  template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  exec(HostThreadTeamData& data) const {
-    const TagType t{};
-    for (int ileague = 0; ileague < m_league; ++ileague) {
-      m_functor(t, Member(data, ileague, m_league));
-    }
-  }
-
- public:
-  inline void execute() const {
-    const size_t pool_reduce_size  = 0;  // Never shrinks
-    const size_t team_reduce_size  = TEAM_REDUCE_SIZE;
-    const size_t team_shared_size  = m_shared;
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    auto* internal_instance = m_policy.space().impl_internal_space_instance();
-    // Need to lock resize_thread_team_data
-    std::lock_guard<std::mutex> lock(
-        internal_instance->m_thread_team_data_mutex);
-    internal_instance->resize_thread_team_data(
-        pool_reduce_size, team_reduce_size, team_shared_size,
-        thread_local_size);
-
-    this->template exec<typename Policy::work_tag>(
-        internal_instance->m_thread_team_data);
-  }
-
-  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_league(arg_policy.league_size()),
-        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {}
-};
-
-/*--------------------------------------------------------------------------*/
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Serial> {
- private:
-  enum { TEAM_REDUCE_SIZE = 512 };
-
-  using Policy = TeamPolicyInternal<Kokkos::Serial, Properties...>;
-
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
-
-  using Member  = typename Policy::member_type;
-  using WorkTag = typename Policy::work_tag;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename Analysis::pointer_type;
-  using reference_type = typename Analysis::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const int m_league;
-  const ReducerType m_reducer;
-  pointer_type m_result_ptr;
-  const int m_shared;
-
-  template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
-      HostThreadTeamData& data, reference_type update) const {
-    for (int ileague = 0; ileague < m_league; ++ileague) {
-      m_functor(Member(data, ileague, m_league), update);
-    }
-  }
-
-  template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  exec(HostThreadTeamData& data, reference_type update) const {
-    const TagType t{};
-
-    for (int ileague = 0; ileague < m_league; ++ileague) {
-      m_functor(t, Member(data, ileague, m_league), update);
-    }
-  }
-
- public:
-  inline void execute() const {
-    const size_t pool_reduce_size =
-        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
-
-    const size_t team_reduce_size  = TEAM_REDUCE_SIZE;
-    const size_t team_shared_size  = m_shared;
-    const size_t thread_local_size = 0;  // Never shrinks
-
-    auto* internal_instance = m_policy.space().impl_internal_space_instance();
-    // Need to lock resize_thread_team_data
-    std::lock_guard<std::mutex> lock(
-        internal_instance->m_thread_team_data_mutex);
-    internal_instance->resize_thread_team_data(
-        pool_reduce_size, team_reduce_size, team_shared_size,
-        thread_local_size);
-
-    pointer_type ptr =
-        m_result_ptr
-            ? m_result_ptr
-            : pointer_type(
-                  internal_instance->m_thread_team_data.pool_reduce_local());
-
-    reference_type update =
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
-
-    this->template exec<WorkTag>(internal_instance->m_thread_team_data, update);
-
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), ptr);
-  }
-
-  template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_league(arg_policy.league_size()),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(m_functor, 1)) {
-    static_assert(Kokkos::is_view<ViewType>::value,
-                  "Reduction result on Kokkos::Serial must be a Kokkos::View");
-
-    static_assert(
-        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
-                                        Kokkos::HostSpace>::accessible,
-        "Reduction result on Kokkos::Serial must be a Kokkos::View in "
-        "HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_league(arg_policy.league_size()),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                            , Kokkos::HostSpace >::value
-    , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-    );*/
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Experimental {
-
-template <>
-class UniqueToken<Serial, UniqueTokenScope::Instance> {
- public:
-  using execution_space = Serial;
-  using size_type       = int;
-
-  /// \brief create object size for concurrency on the given instance
-  ///
-  /// This object should not be shared between instances
-  UniqueToken(execution_space const& = execution_space()) noexcept {}
-
-  /// \brief create object size for requested size on given instance
-  ///
-  /// It is the users responsibility to only acquire size tokens concurrently
-  UniqueToken(size_type, execution_space const& = execution_space()) {}
-
-  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int size() const noexcept { return 1; }
-
-  /// \brief acquire value such that 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int acquire() const noexcept { return 0; }
-
-  /// \brief release a value acquired by generate
-  KOKKOS_INLINE_FUNCTION
-  void release(int) const noexcept {}
-};
-
-template <>
-class UniqueToken<Serial, UniqueTokenScope::Global> {
- public:
-  using execution_space = Serial;
-  using size_type       = int;
-
-  /// \brief create object size for concurrency on the given instance
-  ///
-  /// This object should not be shared between instances
-  UniqueToken(execution_space const& = execution_space()) noexcept {}
-
-  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int size() const noexcept { return 1; }
-
-  /// \brief acquire value such that 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int acquire() const noexcept { return 0; }
-
-  /// \brief release a value acquired by generate
-  KOKKOS_INLINE_FUNCTION
-  void release(int) const noexcept {}
-};
-
-}  // namespace Experimental
-}  // namespace Kokkos
-
-#include <impl/Kokkos_Serial_Task.hpp>
+#include <Serial/Kokkos_Serial_Parallel_Range.hpp>
+#include <Serial/Kokkos_Serial_Parallel_MDRange.hpp>
+#include <Serial/Kokkos_Serial_Parallel_Team.hpp>
+#include <Serial/Kokkos_Serial_Task.hpp>
+#include <Serial/Kokkos_Serial_UniqueToken.hpp>
 
 #endif  // defined( KOKKOS_ENABLE_SERIAL )
 #endif  /* #define KOKKOS_SERIAL_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
index e45feb855..c3453b79e 100644
--- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_TASKSCHEDULER_HPP
 #define KOKKOS_TASKSCHEDULER_HPP
 
@@ -145,7 +154,7 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
                   typename task_base::destroy_type /*arg_destroy*/,
                   FunctorType&& arg_functor) {
     using functor_future_type =
-        future_type_for_functor<typename std::decay<FunctorType>::type>;
+        future_type_for_functor<std::decay_t<FunctorType>>;
     using task_type =
         Impl::Task<BasicTaskScheduler, typename functor_future_type::value_type,
                    FunctorType>;
@@ -301,11 +310,9 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
   }
 
   template <int TaskEnum, typename DepFutureType, typename FunctorType>
-  KOKKOS_FUNCTION
-      future_type_for_functor<typename std::decay<FunctorType>::type>
-      spawn(
-          Impl::TaskPolicyWithPredecessor<TaskEnum, DepFutureType>&& arg_policy,
-          FunctorType&& arg_functor) {
+  KOKKOS_FUNCTION future_type_for_functor<std::decay_t<FunctorType>> spawn(
+      Impl::TaskPolicyWithPredecessor<TaskEnum, DepFutureType>&& arg_policy,
+      FunctorType&& arg_functor) {
     using task_type = runnable_task_type<FunctorType>;
     typename task_type::function_type const ptr = task_type::apply;
     typename task_type::destroy_type const dtor = task_type::destroy;
@@ -521,7 +528,7 @@ namespace Kokkos {
 
 template <class T, class Scheduler>
 Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskTeam,
-                                Kokkos::BasicFuture<T, Scheduler> >
+                                Kokkos::BasicFuture<T, Scheduler>>
     KOKKOS_INLINE_FUNCTION
     TaskTeam(Kokkos::BasicFuture<T, Scheduler> arg_future,
              TaskPriority arg_priority = TaskPriority::Regular) {
@@ -530,23 +537,22 @@ Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskTeam,
 
 template <class Scheduler>
 Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskTeam, Scheduler>
-    KOKKOS_INLINE_FUNCTION
-    TaskTeam(Scheduler arg_scheduler,
-             typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value,
-                                     TaskPriority>::type arg_priority =
-                 TaskPriority::Regular) {
+    KOKKOS_INLINE_FUNCTION TaskTeam(
+        Scheduler arg_scheduler,
+        std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value, TaskPriority>
+            arg_priority = TaskPriority::Regular) {
   return {std::move(arg_scheduler), arg_priority};
 }
 
 template <class Scheduler, class PredecessorFuture>
 Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskTeam, Scheduler,
                               PredecessorFuture>
-    KOKKOS_INLINE_FUNCTION TaskTeam(
-        Scheduler arg_scheduler, PredecessorFuture arg_future,
-        typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value &&
-                                    Kokkos::is_future<PredecessorFuture>::value,
-                                TaskPriority>::type arg_priority =
-            TaskPriority::Regular) {
+    KOKKOS_INLINE_FUNCTION
+    TaskTeam(Scheduler arg_scheduler, PredecessorFuture arg_future,
+             std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value &&
+                                  Kokkos::is_future<PredecessorFuture>::value,
+                              TaskPriority>
+                 arg_priority = TaskPriority::Regular) {
   static_assert(std::is_same<typename PredecessorFuture::scheduler_type,
                              Scheduler>::value,
                 "Can't create a task policy from a scheduler and a future from "
@@ -559,7 +565,7 @@ Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskTeam, Scheduler,
 
 template <class T, class Scheduler>
 Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskSingle,
-                                Kokkos::BasicFuture<T, Scheduler> >
+                                Kokkos::BasicFuture<T, Scheduler>>
     KOKKOS_INLINE_FUNCTION
     TaskSingle(Kokkos::BasicFuture<T, Scheduler> arg_future,
                TaskPriority arg_priority = TaskPriority::Regular) {
@@ -568,23 +574,22 @@ Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskSingle,
 
 template <class Scheduler>
 Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskSingle, Scheduler>
-    KOKKOS_INLINE_FUNCTION
-    TaskSingle(Scheduler arg_scheduler,
-               typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value,
-                                       TaskPriority>::type arg_priority =
-                   TaskPriority::Regular) {
+    KOKKOS_INLINE_FUNCTION TaskSingle(
+        Scheduler arg_scheduler,
+        std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value, TaskPriority>
+            arg_priority = TaskPriority::Regular) {
   return {std::move(arg_scheduler), arg_priority};
 }
 
 template <class Scheduler, class PredecessorFuture>
 Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskSingle, Scheduler,
                               PredecessorFuture>
-    KOKKOS_INLINE_FUNCTION TaskSingle(
-        Scheduler arg_scheduler, PredecessorFuture arg_future,
-        typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value &&
+    KOKKOS_INLINE_FUNCTION
+    TaskSingle(Scheduler arg_scheduler, PredecessorFuture arg_future,
+               std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value &&
                                     Kokkos::is_future<PredecessorFuture>::value,
-                                TaskPriority>::type arg_priority =
-            TaskPriority::Regular) {
+                                TaskPriority>
+                   arg_priority = TaskPriority::Regular) {
   static_assert(std::is_same<typename PredecessorFuture::scheduler_type,
                              Scheduler>::value,
                 "Can't create a task policy from a scheduler and a future from "
@@ -603,8 +608,7 @@ Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskSingle, Scheduler,
  */
 template <int TaskEnum, typename Scheduler, typename DepFutureType,
           typename FunctorType>
-typename Scheduler::template future_type_for_functor<
-    typename std::decay<FunctorType>::type>
+typename Scheduler::template future_type_for_functor<std::decay_t<FunctorType>>
 host_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType>
                arg_policy,
            FunctorType&& arg_functor) {
@@ -635,8 +639,7 @@ host_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType>
  */
 template <int TaskEnum, typename Scheduler, typename DepFutureType,
           typename FunctorType>
-typename Scheduler::template future_type_for_functor<
-    typename std::decay<FunctorType>::type>
+typename Scheduler::template future_type_for_functor<std::decay_t<FunctorType>>
     KOKKOS_INLINE_FUNCTION
     task_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType>
                    arg_policy,
diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp
index 28af6345d..075a9bae2 100644
--- a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_TASKSCHEDULER_FWD_HPP
 #define KOKKOS_TASKSCHEDULER_FWD_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Kokkos_Threads.hpp
index 5879209f1..e6dcad54c 100644
--- a/packages/kokkos/core/src/Kokkos_Threads.hpp
+++ b/packages/kokkos/core/src/Kokkos_Threads.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_THREADS_HPP
 #define KOKKOS_THREADS_HPP
 
@@ -57,7 +66,7 @@
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -99,7 +108,7 @@ class Threads {
   static int in_parallel();
 
   /// \brief Print configuration information to the given output stream.
-  static void print_configuration(std::ostream&, const bool detail = false);
+  void print_configuration(std::ostream& os, bool verbose = false) const;
 
   /// \brief Wait until all dispatched functors complete.
   ///
@@ -107,11 +116,10 @@ class Threads {
   /// return asynchronously, before the functor completes.  This
   /// method does not return until all dispatched functors on this
   /// device have completed.
-  static void impl_static_fence();
   static void impl_static_fence(const std::string& name);
 
-  void fence() const;
-  void fence(const std::string&) const;
+  void fence(const std::string& name =
+                 "Kokkos::Threads::fence: Unnamed Instance Fence") const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
@@ -127,18 +135,7 @@ class Threads {
   //! \name Space-specific functions
   //@{
 
-  /**
-   *  Teams of threads are distributed as evenly as possible across
-   *  the requested number of numa regions and cores per numa region.
-   *  A team will not be split across a numa region.
-   *
-   *  If the 'use_' arguments are not supplied, the hwloc is queried
-   *  to use all available cores.
-   */
-  static void impl_initialize(unsigned threads_count             = 0,
-                              unsigned use_numa_count            = 0,
-                              unsigned use_cores_per_numa        = 0,
-                              bool allow_asynchronous_threadpool = false);
+  static void impl_initialize(InitializationSettings const&);
 
   static int impl_is_initialized();
 
@@ -179,21 +176,6 @@ struct DeviceTypeTraits<Threads> {
 };
 }  // namespace Experimental
 }  // namespace Tools
-
-namespace Impl {
-
-class ThreadsSpaceInitializer : public ExecSpaceInitializerBase {
- public:
-  ThreadsSpaceInitializer()  = default;
-  ~ThreadsSpaceInitializer() = default;
-  void initialize(const InitArguments& args) final;
-  void finalize(const bool) final;
-  void fence() final;
-  void fence(const std::string&) final;
-  void print_configuration(std::ostream& msg, const bool detail) final;
-};
-
-}  // namespace Impl
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -218,7 +200,10 @@ struct MemorySpaceAccess<Kokkos::Threads::memory_space,
 #include <Kokkos_Parallel.hpp>
 #include <Threads/Kokkos_ThreadsExec.hpp>
 #include <Threads/Kokkos_ThreadsTeam.hpp>
-#include <Threads/Kokkos_Threads_Parallel.hpp>
+#include <Threads/Kokkos_Threads_Parallel_Range.hpp>
+#include <Threads/Kokkos_Threads_Parallel_MDRange.hpp>
+#include <Threads/Kokkos_Threads_Parallel_Team.hpp>
+#include <Threads/Kokkos_Threads_UniqueToken.hpp>
 
 #include <KokkosExp_MDRangePolicy.hpp>
 
diff --git a/packages/kokkos/core/src/Kokkos_Timer.hpp b/packages/kokkos/core/src/Kokkos_Timer.hpp
index a3a0b3257..38309b0a3 100644
--- a/packages/kokkos/core/src/Kokkos_Timer.hpp
+++ b/packages/kokkos/core/src/Kokkos_Timer.hpp
@@ -44,6 +44,10 @@
 
 #ifndef KOKKOS_TIMER_HPP
 #define KOKKOS_TIMER_HPP
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_TIMER
+#endif
 
 #include <Kokkos_Macros.hpp>
 // gcc 10.3.0 with CUDA doesn't support std::chrono,
@@ -111,4 +115,8 @@ class Timer {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_TIMER
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_TIMER
+#endif
 #endif /* #ifndef KOKKOS_TIMER_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp
index 52edd8205..dba602732 100644
--- a/packages/kokkos/core/src/Kokkos_Tuners.hpp
+++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_KOKKOS_TUNERS_HPP
 #define KOKKOS_KOKKOS_TUNERS_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_UniqueToken.hpp b/packages/kokkos/core/src/Kokkos_UniqueToken.hpp
index c6c1e7cea..3c58423d3 100644
--- a/packages/kokkos/core/src/Kokkos_UniqueToken.hpp
+++ b/packages/kokkos/core/src/Kokkos_UniqueToken.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_UNIQUE_TOKEN_HPP
 #define KOKKOS_UNIQUE_TOKEN_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_Vectorization.hpp b/packages/kokkos/core/src/Kokkos_Vectorization.hpp
index a232e5b3a..4314ea441 100644
--- a/packages/kokkos/core/src/Kokkos_Vectorization.hpp
+++ b/packages/kokkos/core/src/Kokkos_Vectorization.hpp
@@ -44,6 +44,15 @@
 
 /// \file Kokkos_Vectorization.hpp
 /// \brief Declaration and definition of Kokkos::Vectorization interface.
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_VECTORIZATION_HPP
 #define KOKKOS_VECTORIZATION_HPP
 
diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp
index b8d33e30c..e92ed7d2e 100644
--- a/packages/kokkos/core/src/Kokkos_View.hpp
+++ b/packages/kokkos/core/src/Kokkos_View.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_VIEW_HPP
 #define KOKKOS_VIEW_HPP
 
@@ -54,6 +63,7 @@
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <Kokkos_ExecPolicy.hpp>
+#include <View/Hooks/Kokkos_ViewHooks.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
 
@@ -79,7 +89,7 @@ class ViewMapping {
 };
 
 template <typename IntType>
-KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers(
+constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers(
     const IntType i0, const IntType i1, const IntType i2, const IntType i3,
     const IntType i4, const IntType i5, const IntType i6, const IntType i7) {
   static_assert(std::is_integral<IntType>::value,
@@ -92,40 +102,27 @@ KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers(
 }
 
 KOKKOS_INLINE_FUNCTION
-void runtime_check_rank_device(const size_t dyn_rank, const bool is_void_spec,
-                               const size_t i0, const size_t i1,
-                               const size_t i2, const size_t i3,
-                               const size_t i4, const size_t i5,
-                               const size_t i6, const size_t i7) {
-  if (is_void_spec) {
-    const size_t num_passed_args =
-        count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7);
-
-    if (num_passed_args != dyn_rank && is_void_spec) {
-      Kokkos::abort(
-          "Number of arguments passed to Kokkos::View() constructor must match "
-          "the dynamic rank of the view.");
-    }
-  }
-}
+void runtime_check_rank(const size_t rank, const size_t dyn_rank,
+                        const bool is_void_spec, const size_t i0,
+                        const size_t i1, const size_t i2, const size_t i3,
+                        const size_t i4, const size_t i5, const size_t i6,
+                        const size_t i7, const std::string& label) {
+  (void)(label);
 
-inline void runtime_check_rank_host(const size_t dyn_rank,
-                                    const bool is_void_spec, const size_t i0,
-                                    const size_t i1, const size_t i2,
-                                    const size_t i3, const size_t i4,
-                                    const size_t i5, const size_t i6,
-                                    const size_t i7, const std::string& label) {
   if (is_void_spec) {
     const size_t num_passed_args =
         count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7);
 
-    if (num_passed_args != dyn_rank) {
-      const std::string message =
-          "Constructor for Kokkos View '" + label +
-          "' has mismatched number of arguments. Number of arguments = " +
-          std::to_string(num_passed_args) +
-          " but dynamic rank = " + std::to_string(dyn_rank) + " \n";
-      Kokkos::abort(message.c_str());
+    if (num_passed_args != dyn_rank && num_passed_args != rank) {
+      KOKKOS_IF_ON_HOST(
+          const std::string message =
+              "Constructor for Kokkos View '" + label +
+              "' has mismatched number of arguments. Number of arguments = " +
+              std::to_string(num_passed_args) +
+              " but dynamic rank = " + std::to_string(dyn_rank) + " \n";
+          Kokkos::abort(message.c_str());)
+      KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has "
+                                        "mismatched number of arguments.");)
     }
   }
 }
@@ -174,6 +171,7 @@ struct ViewTraits<void> {
   using array_layout    = void;
   using memory_traits   = void;
   using specialize      = void;
+  using hooks_policy    = void;
 };
 
 template <class... Prop>
@@ -185,12 +183,25 @@ struct ViewTraits<void, void, Prop...> {
   using array_layout    = typename ViewTraits<void, Prop...>::array_layout;
   using memory_traits   = typename ViewTraits<void, Prop...>::memory_traits;
   using specialize      = typename ViewTraits<void, Prop...>::specialize;
+  using hooks_policy    = typename ViewTraits<void, Prop...>::hooks_policy;
 };
 
-template <class ArrayLayout, class... Prop>
+template <class HooksPolicy, class... Prop>
 struct ViewTraits<
-    typename std::enable_if<Kokkos::is_array_layout<ArrayLayout>::value>::type,
-    ArrayLayout, Prop...> {
+    std::enable_if_t<Kokkos::Experimental::is_hooks_policy<HooksPolicy>::value>,
+    HooksPolicy, Prop...> {
+  using execution_space = typename ViewTraits<void, Prop...>::execution_space;
+  using memory_space    = typename ViewTraits<void, Prop...>::memory_space;
+  using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace;
+  using array_layout    = typename ViewTraits<void, Prop...>::array_layout;
+  using memory_traits   = typename ViewTraits<void, Prop...>::memory_traits;
+  using specialize      = typename ViewTraits<void, Prop...>::specialize;
+  using hooks_policy    = HooksPolicy;
+};
+
+template <class ArrayLayout, class... Prop>
+struct ViewTraits<std::enable_if_t<Kokkos::is_array_layout<ArrayLayout>::value>,
+                  ArrayLayout, Prop...> {
   // Specify layout, keep subsequent space and memory traits arguments
 
   using execution_space = typename ViewTraits<void, Prop...>::execution_space;
@@ -199,11 +210,12 @@ struct ViewTraits<
   using array_layout    = ArrayLayout;
   using memory_traits   = typename ViewTraits<void, Prop...>::memory_traits;
   using specialize      = typename ViewTraits<void, Prop...>::specialize;
+  using hooks_policy    = typename ViewTraits<void, Prop...>::hooks_policy;
 };
 
 template <class Space, class... Prop>
-struct ViewTraits<typename std::enable_if<Kokkos::is_space<Space>::value>::type,
-                  Space, Prop...> {
+struct ViewTraits<std::enable_if_t<Kokkos::is_space<Space>::value>, Space,
+                  Prop...> {
   // Specify Space, memory traits should be the only subsequent argument.
 
   static_assert(
@@ -224,12 +236,13 @@ struct ViewTraits<typename std::enable_if<Kokkos::is_space<Space>::value>::type,
   using array_layout  = typename execution_space::array_layout;
   using memory_traits = typename ViewTraits<void, Prop...>::memory_traits;
   using specialize    = typename ViewTraits<void, Prop...>::specialize;
+  using hooks_policy  = typename ViewTraits<void, Prop...>::hooks_policy;
 };
 
 template <class MemoryTraits, class... Prop>
-struct ViewTraits<typename std::enable_if<
-                      Kokkos::is_memory_traits<MemoryTraits>::value>::type,
-                  MemoryTraits, Prop...> {
+struct ViewTraits<
+    std::enable_if_t<Kokkos::is_memory_traits<MemoryTraits>::value>,
+    MemoryTraits, Prop...> {
   // Specify memory trait, should not be any subsequent arguments
 
   static_assert(
@@ -240,6 +253,8 @@ struct ViewTraits<typename std::enable_if<
           std::is_same<typename ViewTraits<void, Prop...>::array_layout,
                        void>::value &&
           std::is_same<typename ViewTraits<void, Prop...>::memory_traits,
+                       void>::value &&
+          std::is_same<typename ViewTraits<void, Prop...>::hooks_policy,
                        void>::value,
       "MemoryTrait is the final optional template argument for a View");
 
@@ -249,6 +264,7 @@ struct ViewTraits<typename std::enable_if<
   using array_layout    = void;
   using memory_traits   = MemoryTraits;
   using specialize      = void;
+  using hooks_policy    = void;
 };
 
 template <class DataType, class... Properties>
@@ -257,26 +273,35 @@ struct ViewTraits {
   // Unpack the properties arguments
   using prop = ViewTraits<void, Properties...>;
 
-  using ExecutionSpace = typename std::conditional<
-      !std::is_same<typename prop::execution_space, void>::value,
-      typename prop::execution_space, Kokkos::DefaultExecutionSpace>::type;
+  using ExecutionSpace =
+      std::conditional_t<!std::is_void<typename prop::execution_space>::value,
+                         typename prop::execution_space,
+                         Kokkos::DefaultExecutionSpace>;
 
-  using MemorySpace = typename std::conditional<
-      !std::is_same<typename prop::memory_space, void>::value,
-      typename prop::memory_space, typename ExecutionSpace::memory_space>::type;
+  using MemorySpace =
+      std::conditional_t<!std::is_void<typename prop::memory_space>::value,
+                         typename prop::memory_space,
+                         typename ExecutionSpace::memory_space>;
 
-  using ArrayLayout = typename std::conditional<
-      !std::is_same<typename prop::array_layout, void>::value,
-      typename prop::array_layout, typename ExecutionSpace::array_layout>::type;
+  using ArrayLayout =
+      std::conditional_t<!std::is_void<typename prop::array_layout>::value,
+                         typename prop::array_layout,
+                         typename ExecutionSpace::array_layout>;
 
-  using HostMirrorSpace = typename std::conditional<
-      !std::is_same<typename prop::HostMirrorSpace, void>::value,
+  using HostMirrorSpace = std::conditional_t<
+      !std::is_void<typename prop::HostMirrorSpace>::value,
       typename prop::HostMirrorSpace,
-      typename Kokkos::Impl::HostMirror<ExecutionSpace>::Space>::type;
+      typename Kokkos::Impl::HostMirror<ExecutionSpace>::Space>;
 
-  using MemoryTraits = typename std::conditional<
-      !std::is_same<typename prop::memory_traits, void>::value,
-      typename prop::memory_traits, typename Kokkos::MemoryManaged>::type;
+  using MemoryTraits =
+      std::conditional_t<!std::is_void<typename prop::memory_traits>::value,
+                         typename prop::memory_traits,
+                         typename Kokkos::MemoryManaged>;
+
+  using HooksPolicy =
+      std::conditional_t<!std::is_void<typename prop::hooks_policy>::value,
+                         typename prop::hooks_policy,
+                         Kokkos::Experimental::DefaultViewHooks>;
 
   // Analyze data type's properties,
   // May be specialized based upon the layout and value type
@@ -312,10 +337,10 @@ struct ViewTraits {
   using array_layout = ArrayLayout;
   using dimension    = typename data_analysis::dimension;
 
-  using specialize = typename std::conditional<
-      std::is_same<typename data_analysis::specialize, void>::value,
-      typename prop::specialize, typename data_analysis::specialize>::
-      type; /* mapping specialization tag */
+  using specialize = std::conditional_t<
+      std::is_void<typename data_analysis::specialize>::value,
+      typename prop::specialize,
+      typename data_analysis::specialize>; /* mapping specialization tag */
 
   enum { rank = dimension::rank };
   enum { rank_dynamic = dimension::rank_dynamic };
@@ -328,6 +353,7 @@ struct ViewTraits {
   using device_type       = Kokkos::Device<ExecutionSpace, MemorySpace>;
   using memory_traits     = MemoryTraits;
   using host_mirror_space = HostMirrorSpace;
+  using hooks_policy      = HooksPolicy;
 
   using size_type = typename MemorySpace::size_type;
 
@@ -445,9 +471,8 @@ struct is_always_assignable_impl<Kokkos::View<ViewTDst...>,
 
 template <class View1, class View2>
 using is_always_assignable = is_always_assignable_impl<
-    typename std::remove_reference<View1>::type,
-    typename std::remove_const<
-        typename std::remove_reference<View2>::type>::type>;
+    std::remove_reference_t<View1>,
+    std::remove_const_t<std::remove_reference_t<View2>>>;
 
 #ifdef KOKKOS_ENABLE_CXX17
 template <class T1, class T2>
@@ -589,6 +614,7 @@ class View : public ViewTraits<DataType, Properties...> {
       Kokkos::Impl::ViewMapping<traits, typename traits::specialize>;
   template <typename V>
   friend struct Kokkos::Impl::ViewTracker;
+  using hooks_policy = typename traits::hooks_policy;
 
   view_tracker_type m_track;
   map_type m_map;
@@ -598,28 +624,32 @@ class View : public ViewTraits<DataType, Properties...> {
   /** \brief  Compatible view of array of scalar types */
   using array_type =
       View<typename traits::scalar_array_type, typename traits::array_layout,
-           typename traits::device_type, typename traits::memory_traits>;
+           typename traits::device_type, typename traits::hooks_policy,
+           typename traits::memory_traits>;
 
   /** \brief  Compatible view of const data type */
   using const_type =
       View<typename traits::const_data_type, typename traits::array_layout,
-           typename traits::device_type, typename traits::memory_traits>;
+           typename traits::device_type, typename traits::hooks_policy,
+           typename traits::memory_traits>;
 
   /** \brief  Compatible view of non-const data type */
   using non_const_type =
       View<typename traits::non_const_data_type, typename traits::array_layout,
-           typename traits::device_type, typename traits::memory_traits>;
+           typename traits::device_type, typename traits::hooks_policy,
+           typename traits::memory_traits>;
 
   /** \brief  Compatible HostMirror view */
   using HostMirror =
       View<typename traits::non_const_data_type, typename traits::array_layout,
            Device<DefaultHostExecutionSpace,
-                  typename traits::host_mirror_space::memory_space>>;
+                  typename traits::host_mirror_space::memory_space>,
+           typename traits::hooks_policy>;
 
   /** \brief  Compatible HostMirror view */
   using host_mirror_type =
       View<typename traits::non_const_data_type, typename traits::array_layout,
-           typename traits::host_mirror_space>;
+           typename traits::host_mirror_space, typename traits::hooks_policy>;
 
   /** \brief Unified types */
   using uniform_type = typename Impl::ViewUniformType<View, 0>::type;
@@ -650,9 +680,9 @@ class View : public ViewTraits<DataType, Properties...> {
   // constexpr unsigned rank() { return map_type::Rank; }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
-      extent(const iType& r) const noexcept {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, size_t>
+  extent(const iType& r) const noexcept {
     return m_map.extent(r);
   }
 
@@ -662,9 +692,9 @@ class View : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, int>::type
-      extent_int(const iType& r) const noexcept {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, int>
+  extent_int(const iType& r) const noexcept {
     return static_cast<int>(m_map.extent(r));
   }
 
@@ -710,9 +740,9 @@ class View : public ViewTraits<DataType, Properties...> {
   }
 
   template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
-      stride(iType r) const {
+  KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+      std::is_integral<iType>::value, size_t>
+  stride(iType r) const {
     return (
         r == 0
             ? m_map.stride_0()
@@ -783,440 +813,302 @@ class View : public ViewTraits<DataType, Properties...> {
       std::is_same<typename traits::array_layout, Kokkos::LayoutStride>::value;
 
   static constexpr bool is_default_map =
-      std::is_same<typename traits::specialize, void>::value &&
+      std::is_void<typename traits::specialize>::value &&
       (is_layout_left || is_layout_right || is_layout_stride);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
 
-#define KOKKOS_IMPL_SINK(ARG) ARG
-
-#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)                             \
-  Kokkos::Impl::runtime_check_memory_access_violation<                    \
-      typename traits::memory_space>(                                     \
-      "Kokkos::View ERROR: attempt to access inaccessible memory space"); \
-  Kokkos::Impl::view_verify_operator_bounds<typename traits::memory_space> ARG;
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...)                               \
+  Kokkos::Impl::runtime_check_memory_access_violation<                      \
+      typename traits::memory_space>(                                       \
+      "Kokkos::View ERROR: attempt to access inaccessible memory space",    \
+      __VA_ARGS__);                                                         \
+  Kokkos::Impl::view_verify_operator_bounds<typename traits::memory_space>( \
+      __VA_ARGS__);
 
 #else
 
-#define KOKKOS_IMPL_SINK(ARG)
-
-#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)          \
-  Kokkos::Impl::runtime_check_memory_access_violation< \
-      typename traits::memory_space>(                  \
-      "Kokkos::View ERROR: attempt to access inaccessible memory space");
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...)                            \
+  Kokkos::Impl::runtime_check_memory_access_violation<                   \
+      typename traits::memory_space>(                                    \
+      "Kokkos::View ERROR: attempt to access inaccessible memory space", \
+      __VA_ARGS__);
 
 #endif
 
- public:
-  //------------------------------
-  // Rank 0 operator()
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  reference_type operator()() const { return m_map.reference(); }
-  //------------------------------
-  // Rank 1 operator()
+  template <typename... Is>
+  static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) {
+    static_assert(Rank <= sizeof...(Is), "");
+    static_assert(sizeof...(Is) <= 8, "");
+    static_assert(Kokkos::Impl::are_integral<Is...>::value, "");
+  }
 
-  template <typename I0>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && !is_default_map),
-                              reference_type>::type
-      operator()(const I0& i0) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0))
-    return m_map.reference(i0);
+  template <typename... Is>
+  static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) {
+    static_assert(Rank == sizeof...(Is), "");
+    static_assert(Kokkos::Impl::are_integral<Is...>::value, "");
   }
 
+ public:
+  //------------------------------
+  // Rank 1 default map operator()
+
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               !is_layout_stride),
-                              reference_type>::type
-      operator()(const I0& i0) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0>::value &&  //
+                        (1 == Rank) && is_default_map && !is_layout_stride),
+                       reference_type>
+      operator()(I0 i0) const {
+    check_operator_parens_valid_args(i0);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0)
     return m_map.m_impl_handle[i0];
   }
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               is_layout_stride),
-                              reference_type>::type
-      operator()(const I0& i0) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0>::value &&  //
+                        (1 == Rank) && is_default_map && is_layout_stride),
+                       reference_type>
+      operator()(I0 i0) const {
+    check_operator_parens_valid_args(i0);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0)
     return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0];
   }
+
   //------------------------------
   // Rank 1 operator[]
 
   template <typename I0>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && !is_default_map),
-                              reference_type>::type
-      operator[](const I0& i0) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0))
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      ((1 == Rank) && Kokkos::Impl::are_integral<I0>::value && !is_default_map),
+      reference_type>
+  operator[](I0 i0) const {
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0)
     return m_map.reference(i0);
   }
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               !is_layout_stride),
-                              reference_type>::type
-      operator[](const I0& i0) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0))
+      std::enable_if_t<((1 == Rank) && Kokkos::Impl::are_integral<I0>::value &&
+                        is_default_map && !is_layout_stride),
+                       reference_type>
+      operator[](I0 i0) const {
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0)
     return m_map.m_impl_handle[i0];
   }
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value &&
-                               (1 == Rank) && is_default_map &&
-                               is_layout_stride),
-                              reference_type>::type
-      operator[](const I0& i0) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0))
+      std::enable_if_t<((1 == Rank) && Kokkos::Impl::are_integral<I0>::value &&
+                        is_default_map && is_layout_stride),
+                       reference_type>
+      operator[](I0 i0) const {
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0)
     return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0];
   }
 
   //------------------------------
-  // Rank 2
-
-  template <typename I0, typename I1>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && !is_default_map),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1))
-    return m_map.reference(i0, i1);
-  }
+  // Rank 2 default map operator()
 
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_left && (traits::rank_dynamic == 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
+                        (2 == Rank) && is_default_map && is_layout_left &&
+                        (traits::rank_dynamic == 0)),
+                       reference_type>
+      operator()(I0 i0, I1 i1) const {
+    check_operator_parens_valid_args(i0, i1);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1];
   }
 
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_left && (traits::rank_dynamic != 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
+                        (2 == Rank) && is_default_map && is_layout_left &&
+                        (traits::rank_dynamic != 0)),
+                       reference_type>
+      operator()(I0 i0, I1 i1) const {
+    check_operator_parens_valid_args(i0, i1);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1];
   }
 
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_right && (traits::rank_dynamic == 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
+                        (2 == Rank) && is_default_map && is_layout_right &&
+                        (traits::rank_dynamic == 0)),
+                       reference_type>
+      operator()(I0 i0, I1 i1) const {
+    check_operator_parens_valid_args(i0, i1);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0];
   }
 
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_right && (traits::rank_dynamic != 0)),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
+                        (2 == Rank) && is_default_map && is_layout_right &&
+                        (traits::rank_dynamic != 0)),
+                       reference_type>
+      operator()(I0 i0, I1 i1) const {
+    check_operator_parens_valid_args(i0, i1);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0];
   }
 
   template <typename I0, typename I1>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value &&
-                               (2 == Rank) && is_default_map &&
-                               is_layout_stride),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value &&  //
+                        (2 == Rank) && is_default_map && is_layout_stride),
+                       reference_type>
+      operator()(I0 i0, I1 i1) const {
+    check_operator_parens_valid_args(i0, i1);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1)
     return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 +
                                i1 * m_map.m_impl_offset.m_stride.S1];
   }
 
-  //------------------------------
-  // Rank 3
+  // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which
+  // have "inlined" versions above
 
-  template <typename I0, typename I1, typename I2>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value &&
-                               (3 == Rank) && is_default_map),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1, const I2& i2) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2))
-    return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)];
+  template <typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<Is...>::value &&  //
+       (2 != Rank) && (1 != Rank) && (0 != Rank) && is_default_map),
+      reference_type>
+  operator()(Is... indices) const {
+    check_operator_parens_valid_args(indices...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...)
+    return m_map.m_impl_handle[m_map.m_impl_offset(indices...)];
   }
 
-  template <typename I0, typename I1, typename I2>
+  template <typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value &&
-                               (3 == Rank) && !is_default_map),
-                              reference_type>::type
-      operator()(const I0& i0, const I1& i1, const I2& i2) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2))
-    return m_map.reference(i0, i1, i2);
-  }
-
-  //------------------------------
-  // Rank 4
-
-  template <typename I0, typename I1, typename I2, typename I3>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) &&
-       is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3))
-    return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)];
-  }
-
-  template <typename I0, typename I1, typename I2, typename I3>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) &&
-       !is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3))
-    return m_map.reference(i0, i1, i2, i3);
-  }
-
-  //------------------------------
-  // Rank 5
-
-  template <typename I0, typename I1, typename I2, typename I3, typename I4>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) &&
-       is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4))
-    return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)];
-  }
-
-  template <typename I0, typename I1, typename I2, typename I3, typename I4>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) &&
-       !is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4))
-    return m_map.reference(i0, i1, i2, i3, i4);
-  }
-
-  //------------------------------
-  // Rank 6
-
-  template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value &&
-       (6 == Rank) && is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4, const I5& i5) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4, i5))
-    return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)];
-  }
-
-  template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value &&
-       (6 == Rank) && !is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4, const I5& i5) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4, i5))
-    return m_map.reference(i0, i1, i2, i3, i4, i5);
+      std::enable_if_t<(Kokkos::Impl::always_true<Is...>::value &&  //
+                        ((0 == Rank) || !is_default_map)),
+                       reference_type>
+      operator()(Is... indices) const {
+    check_operator_parens_valid_args(indices...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...)
+    return m_map.reference(indices...);
   }
 
   //------------------------------
-  // Rank 7
-
-  template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value &&
-       (7 == Rank) && is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4, const I5& i5, const I6& i6) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        (m_track, m_map, i0, i1, i2, i3, i4, i5, i6))
-    return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)];
-  }
-
-  template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value &&
-       (7 == Rank) && !is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4, const I5& i5, const I6& i6) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        (m_track, m_map, i0, i1, i2, i3, i4, i5, i6))
-    return m_map.reference(i0, i1, i2, i3, i4, i5, i6);
+  // Rank 0
+
+  template <typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<Is...>::value && (0 == Rank)), reference_type>
+  access(Is... extra) const {
+    check_access_member_function_valid_args(extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...)
+    return m_map.reference();
   }
 
   //------------------------------
-  // Rank 8
+  // Rank 1
 
-  template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6, typename I7>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value &&
-       (8 == Rank) && is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4, const I5& i5, const I6& i6, const I7& i7) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7))
-    return m_map
-        .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)];
-  }
-
-  template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6, typename I7>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value &&
-       (8 == Rank) && !is_default_map),
-      reference_type>::type
-  operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-             const I4& i4, const I5& i5, const I6& i6, const I7& i7) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7))
-    return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7);
-  }
-
-  template <class... Args>
+  template <typename I0, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<Args...>::value &&
-                               (0 == Rank)),
-                              reference_type>::type
-      access(Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, args...)))
-    return m_map.reference();
-  }
-
-  template <typename I0, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, Args...>::value &&
-                               (1 == Rank) && !is_default_map),
-                              reference_type>::type
-      access(const I0& i0, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, args...)))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value &&
+                        (1 == Rank) && !is_default_map),
+                       reference_type>
+      access(I0 i0, Is... extra) const {
+    check_access_member_function_valid_args(i0, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...)
     return m_map.reference(i0);
   }
 
-  template <typename I0, class... Args>
+  template <typename I0, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, Args...>::value &&
-                               (1 == Rank) && is_default_map &&
-                               !is_layout_stride),
-                              reference_type>::type
-      access(const I0& i0, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, args...)))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value &&
+                        (1 == Rank) && is_default_map && !is_layout_stride),
+                       reference_type>
+      access(I0 i0, Is... extra) const {
+    check_access_member_function_valid_args(i0, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...)
     return m_map.m_impl_handle[i0];
   }
 
-  template <typename I0, class... Args>
+  template <typename I0, typename... Is>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(Kokkos::Impl::are_integral<I0, Args...>::value &&
-                               (1 == Rank) && is_default_map &&
-                               is_layout_stride),
-                              reference_type>::type
-      access(const I0& i0, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, args...)))
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value &&
+                        (1 == Rank) && is_default_map && is_layout_stride),
+                       reference_type>
+      access(I0 i0, Is... extra) const {
+    check_access_member_function_valid_args(i0, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...)
     return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0];
   }
 
-  template <typename I0, typename I1, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) &&
-       !is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...)))
+  //------------------------------
+  // Rank 2
+
+  template <typename I0, typename I1, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, Is...>::value &&
+                        (2 == Rank) && !is_default_map),
+                       reference_type>
+      access(I0 i0, I1 i1, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...)
     return m_map.reference(i0, i1);
   }
 
-  template <typename I0, typename I1, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) &&
+  template <typename I0, typename I1, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
        is_default_map && is_layout_left && (traits::rank_dynamic == 0)),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...)
     return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1];
   }
 
-  template <typename I0, typename I1, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) &&
+  template <typename I0, typename I1, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
        is_default_map && is_layout_left && (traits::rank_dynamic != 0)),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...)
     return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1];
   }
 
-  template <typename I0, typename I1, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) &&
+  template <typename I0, typename I1, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
        is_default_map && is_layout_right && (traits::rank_dynamic == 0)),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...)
     return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0];
   }
 
-  template <typename I0, typename I1, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) &&
+  template <typename I0, typename I1, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == Rank) &&
        is_default_map && is_layout_right && (traits::rank_dynamic != 0)),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...)
     return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0];
   }
 
-  template <typename I0, typename I1, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) &&
-       is_default_map && is_layout_stride),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...)))
+  template <typename I0, typename I1, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, Is...>::value &&
+                        (2 == Rank) && is_default_map && is_layout_stride),
+                       reference_type>
+      access(I0 i0, I1 i1, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...)
     return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 +
                                i1 * m_map.m_impl_offset.m_stride.S1];
   }
@@ -1224,54 +1116,50 @@ class View : public ViewTraits<DataType, Properties...> {
   //------------------------------
   // Rank 3
 
-  template <typename I0, typename I1, typename I2, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, Args...>::value && (3 == Rank) &&
-       is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, args...)))
+  template <typename I0, typename I1, typename I2, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value &&
+                        (3 == Rank) && is_default_map),
+                       reference_type>
+      access(I0 i0, I1 i1, I2 i2, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...)
     return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)];
   }
 
-  template <typename I0, typename I1, typename I2, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, Args...>::value && (3 == Rank) &&
-       !is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, args...)))
+  template <typename I0, typename I1, typename I2, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value &&
+                        (3 == Rank) && !is_default_map),
+                       reference_type>
+      access(I0 i0, I1 i1, I2 i2, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...)
     return m_map.reference(i0, i1, i2);
   }
 
   //------------------------------
   // Rank 4
 
-  template <typename I0, typename I1, typename I2, typename I3, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, Args...>::value &&
-       (4 == Rank) && is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, args...)))
+  template <typename I0, typename I1, typename I2, typename I3, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == Rank) &&
+       is_default_map),
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...)
     return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)];
   }
 
-  template <typename I0, typename I1, typename I2, typename I3, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, Args...>::value &&
-       (4 == Rank) && !is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, args...)))
+  template <typename I0, typename I1, typename I2, typename I3, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == Rank) &&
+       !is_default_map),
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...)
     return m_map.reference(i0, i1, i2, i3);
   }
 
@@ -1279,28 +1167,28 @@ class View : public ViewTraits<DataType, Properties...> {
   // Rank 5
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, Args...>::value &&
+            typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value &&
        (5 == Rank) && is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4,
+                                     extra...)
     return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)];
   }
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, Args...>::value &&
+            typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value &&
        (5 == Rank) && !is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4,
+                                     extra...)
     return m_map.reference(i0, i1, i2, i3, i4);
   }
 
@@ -1308,28 +1196,28 @@ class View : public ViewTraits<DataType, Properties...> {
   // Rank 6
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, Args...>::value &&
+            typename I5, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value &&
        (6 == Rank) && is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         const I5& i5, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5,
+                                     extra...)
     return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)];
   }
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, Args...>::value &&
+            typename I5, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value &&
        (6 == Rank) && !is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         const I5& i5, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5,
+                                     extra...)
     return m_map.reference(i0, i1, i2, i3, i4, i5);
   }
 
@@ -1337,28 +1225,30 @@ class View : public ViewTraits<DataType, Properties...> {
   // Rank 7
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, Args...>::value &&
+            typename I5, typename I6, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value &&
        (7 == Rank) && is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         const I5& i5, const I6& i6, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, i6, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6,
+                                            extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6,
+                                     extra...)
     return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)];
   }
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, Args...>::value &&
+            typename I5, typename I6, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<
+      (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value &&
        (7 == Rank) && !is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         const I5& i5, const I6& i6, Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
-        KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, i6, args...)))
+      reference_type>
+  access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6,
+                                            extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6,
+                                     extra...)
     return m_map.reference(i0, i1, i2, i3, i4, i5, i6);
   }
 
@@ -1366,33 +1256,35 @@ class View : public ViewTraits<DataType, Properties...> {
   // Rank 8
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6, typename I7, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7,
-                                  Args...>::value &&
-       (8 == Rank) && is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         const I5& i5, const I6& i6, const I7& i7,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(KOKKOS_IMPL_SINK(
-        (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7, args...)))
+            typename I5, typename I6, typename I7, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6,
+                                                  I7, Is...>::value &&
+                        (8 == Rank) && is_default_map),
+                       reference_type>
+      access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7,
+             Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7,
+                                            extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6,
+                                     i7, extra...)
     return m_map
         .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)];
   }
 
   template <typename I0, typename I1, typename I2, typename I3, typename I4,
-            typename I5, typename I6, typename I7, class... Args>
-  KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if<
-      (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7,
-                                  Args...>::value &&
-       (8 == Rank) && !is_default_map),
-      reference_type>::type
-  access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4,
-         const I5& i5, const I6& i6, const I7& i7,
-         Args... KOKKOS_IMPL_SINK(args)) const {
-    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(KOKKOS_IMPL_SINK(
-        (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7, args...)))
+            typename I5, typename I6, typename I7, typename... Is>
+  KOKKOS_FORCEINLINE_FUNCTION
+      std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6,
+                                                  I7, Is...>::value &&
+                        (8 == Rank) && !is_default_map),
+                       reference_type>
+      access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7,
+             Is... extra) const {
+    check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7,
+                                            extra...);
+    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6,
+                                     i7, extra...)
     return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7);
   }
 
@@ -1407,17 +1299,36 @@ class View : public ViewTraits<DataType, Properties...> {
   KOKKOS_DEFAULTED_FUNCTION
   View() = default;
 
-  KOKKOS_DEFAULTED_FUNCTION
-  View(const View&) = default;
+  KOKKOS_FUNCTION
+  View(const View& other) : m_track(other.m_track), m_map(other.m_map) {
+    KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);))
+  }
 
-  KOKKOS_DEFAULTED_FUNCTION
-  View(View&&) = default;
+  KOKKOS_FUNCTION
+  View(View&& other)
+      : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} {
+    KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);))
+  }
 
-  KOKKOS_DEFAULTED_FUNCTION
-  View& operator=(const View&) = default;
+  KOKKOS_FUNCTION
+  View& operator=(const View& other) {
+    m_map   = other.m_map;
+    m_track = other.m_track;
 
-  KOKKOS_DEFAULTED_FUNCTION
-  View& operator=(View&&) = default;
+    KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);))
+
+    return *this;
+  }
+
+  KOKKOS_FUNCTION
+  View& operator=(View&& other) {
+    m_map   = std::move(other.m_map);
+    m_track = std::move(other.m_track);
+
+    KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);))
+
+    return *this;
+  }
 
   //----------------------------------------
   // Compatible view copy constructor and assignment
@@ -1426,10 +1337,9 @@ class View : public ViewTraits<DataType, Properties...> {
   template <class RT, class... RP>
   KOKKOS_INLINE_FUNCTION View(
       const View<RT, RP...>& rhs,
-      typename std::enable_if<Kokkos::Impl::ViewMapping<
+      std::enable_if_t<Kokkos::Impl::ViewMapping<
           traits, typename View<RT, RP...>::traits,
-          typename traits::specialize>::is_assignable_data_type>::type* =
-          nullptr)
+          typename traits::specialize>::is_assignable_data_type>* = nullptr)
       : m_track(rhs), m_map() {
     using SrcTraits = typename View<RT, RP...>::traits;
     using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits,
@@ -1440,11 +1350,11 @@ class View : public ViewTraits<DataType, Properties...> {
   }
 
   template <class RT, class... RP>
-  KOKKOS_INLINE_FUNCTION typename std::enable_if<
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<
       Kokkos::Impl::ViewMapping<
           traits, typename View<RT, RP...>::traits,
           typename traits::specialize>::is_assignable_data_type,
-      View>::type&
+      View>&
   operator=(const View<RT, RP...>& rhs) {
     using SrcTraits = typename View<RT, RP...>::traits;
     using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits,
@@ -1489,15 +1399,19 @@ class View : public ViewTraits<DataType, Properties...> {
         .template get_label<typename traits::memory_space>();
   }
 
+ private:
+  enum class check_input_args : bool { yes = true, no = false };
+
+ public:
   //----------------------------------------
   // Allocation according to allocation properties and array layout
 
   template <class... P>
   explicit inline View(
       const Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<!Impl::ViewCtorProp<P...>::has_pointer,
-                              typename traits::array_layout>::type const&
-          arg_layout)
+      std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer,
+                       typename traits::array_layout> const& arg_layout,
+      check_input_args check_args = check_input_args::no)
       : m_track(), m_map() {
     // Append layout and spaces if not input
     using alloc_prop_input = Impl::ViewCtorProp<P...>;
@@ -1506,17 +1420,15 @@ class View : public ViewTraits<DataType, Properties...> {
     // to avoid duplicate class error.
     using alloc_prop = Impl::ViewCtorProp<
         P...,
-        typename std::conditional<alloc_prop_input::has_label,
-                                  std::integral_constant<unsigned int, 0>,
-                                  typename std::string>::type,
-        typename std::conditional<
-            alloc_prop_input::has_memory_space,
-            std::integral_constant<unsigned int, 1>,
-            typename traits::device_type::memory_space>::type,
-        typename std::conditional<
-            alloc_prop_input::has_execution_space,
-            std::integral_constant<unsigned int, 2>,
-            typename traits::device_type::execution_space>::type>;
+        std::conditional_t<alloc_prop_input::has_label,
+                           std::integral_constant<unsigned int, 0>,
+                           std::string>,
+        std::conditional_t<alloc_prop_input::has_memory_space,
+                           std::integral_constant<unsigned int, 1>,
+                           typename traits::device_type::memory_space>,
+        std::conditional_t<alloc_prop_input::has_execution_space,
+                           std::integral_constant<unsigned int, 2>,
+                           typename traits::device_type::execution_space>>;
 
     static_assert(traits::is_managed,
                   "View allocation constructor requires managed memory");
@@ -1533,6 +1445,26 @@ class View : public ViewTraits<DataType, Properties...> {
     // Copy the input allocation properties with possibly defaulted properties
     alloc_prop prop_copy(arg_prop);
 
+    if (check_args == check_input_args::yes) {
+      size_t i0 = arg_layout.dimension[0];
+      size_t i1 = arg_layout.dimension[1];
+      size_t i2 = arg_layout.dimension[2];
+      size_t i3 = arg_layout.dimension[3];
+      size_t i4 = arg_layout.dimension[4];
+      size_t i5 = arg_layout.dimension[5];
+      size_t i6 = arg_layout.dimension[6];
+      size_t i7 = arg_layout.dimension[7];
+
+      const std::string& alloc_name =
+          static_cast<Kokkos::Impl::ViewCtorProp<void, std::string> const&>(
+              prop_copy)
+              .value;
+      Impl::runtime_check_rank(
+          traits::rank, traits::rank_dynamic,
+          std::is_same<typename traits::specialize, void>::value, i0, i1, i2,
+          i3, i4, i5, i6, i7, alloc_name);
+    }
+
 //------------------------------------------------------------
 #if defined(KOKKOS_ENABLE_CUDA)
     // If allocating in CudaUVMSpace must fence before and after
@@ -1548,8 +1480,8 @@ class View : public ViewTraits<DataType, Properties...> {
 #endif
     //------------------------------------------------------------
 
-    Kokkos::Impl::SharedAllocationRecord<>* record =
-        m_map.allocate_shared(prop_copy, arg_layout);
+    Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared(
+        prop_copy, arg_layout, Impl::ViewCtorProp<P...>::has_execution_space);
 
 //------------------------------------------------------------
 #if defined(KOKKOS_ENABLE_CUDA)
@@ -1575,9 +1507,9 @@ class View : public ViewTraits<DataType, Properties...> {
   template <class... P>
   explicit KOKKOS_INLINE_FUNCTION View(
       const Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<Impl::ViewCtorProp<P...>::has_pointer,
-                              typename traits::array_layout>::type const&
-          arg_layout)
+      std::enable_if_t<Impl::ViewCtorProp<P...>::has_pointer,
+                       typename traits::array_layout> const& arg_layout,
+      check_input_args /*ignored*/ = check_input_args::no)  // Not checking
       : m_track()  // No memory tracking
         ,
         m_map(arg_prop, arg_layout) {
@@ -1592,9 +1524,8 @@ class View : public ViewTraits<DataType, Properties...> {
   template <class... P>
   explicit inline View(
       const Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<!Impl::ViewCtorProp<P...>::has_pointer,
-                              size_t>::type const arg_N0 =
-          KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer, size_t> const
+          arg_N0          = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -1604,25 +1535,18 @@ class View : public ViewTraits<DataType, Properties...> {
       const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
       : View(arg_prop,
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
-                                           arg_N4, arg_N5, arg_N6, arg_N7)) {
-    KOKKOS_IF_ON_HOST(
-        (Impl::runtime_check_rank_host(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());))
-    KOKKOS_IF_ON_DEVICE(
-        (Impl::runtime_check_rank_device(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);))
+                                           arg_N4, arg_N5, arg_N6, arg_N7),
+             check_input_args::yes) {
+    static_assert(traits::array_layout::is_extent_constructible,
+                  "Layout is not constructible from extent arguments. Use "
+                  "overload taking a layout object instead.");
   }
 
   template <class... P>
   explicit KOKKOS_INLINE_FUNCTION View(
       const Impl::ViewCtorProp<P...>& arg_prop,
-      typename std::enable_if<Impl::ViewCtorProp<P...>::has_pointer,
-                              size_t>::type const arg_N0 =
-          KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      std::enable_if_t<Impl::ViewCtorProp<P...>::has_pointer, size_t> const
+          arg_N0          = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -1632,35 +1556,28 @@ class View : public ViewTraits<DataType, Properties...> {
       const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
       : View(arg_prop,
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
-                                           arg_N4, arg_N5, arg_N6, arg_N7)) {
-    KOKKOS_IF_ON_HOST(
-        (Impl::runtime_check_rank_host(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());))
-    KOKKOS_IF_ON_DEVICE(
-        (Impl::runtime_check_rank_device(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);))
+                                           arg_N4, arg_N5, arg_N6, arg_N7),
+             check_input_args::yes) {
+    static_assert(traits::array_layout::is_extent_constructible,
+                  "Layout is not constructible from extent arguments. Use "
+                  "overload taking a layout object instead.");
   }
 
   // Allocate with label and layout
   template <typename Label>
   explicit inline View(
       const Label& arg_label,
-      typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value,
-                              typename traits::array_layout>::type const&
-          arg_layout)
-      : View(Impl::ViewCtorProp<std::string>(arg_label), arg_layout) {}
+      std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value,
+                       typename traits::array_layout> const& arg_layout)
+      : View(Impl::ViewCtorProp<std::string>(arg_label), arg_layout,
+             check_input_args::yes) {}
 
   // Allocate label and layout, must disambiguate from subview constructor.
   template <typename Label>
   explicit inline View(
       const Label& arg_label,
-      typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value,
-                              const size_t>::type arg_N0 =
-          KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value, const size_t>
+          arg_N0          = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@@ -1670,21 +1587,11 @@ class View : public ViewTraits<DataType, Properties...> {
       const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
       : View(Impl::ViewCtorProp<std::string>(arg_label),
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
-                                           arg_N4, arg_N5, arg_N6, arg_N7)) {
+                                           arg_N4, arg_N5, arg_N6, arg_N7),
+             check_input_args::yes) {
     static_assert(traits::array_layout::is_extent_constructible,
-                  "Layout is not extent constructible. A layout object should "
-                  "be passed too.\n");
-
-    KOKKOS_IF_ON_HOST(
-        (Impl::runtime_check_rank_host(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());))
-    KOKKOS_IF_ON_DEVICE(
-        (Impl::runtime_check_rank_device(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);))
+                  "Layout is not constructible from extent arguments. Use "
+                  "overload taking a layout object instead.");
   }
 
   // Construct view from ViewTracker and map
@@ -1719,10 +1626,18 @@ class View : public ViewTraits<DataType, Properties...> {
 
   //----------------------------------------
   // Memory span required to wrap these dimensions.
+  static constexpr size_t required_allocation_size(
+      typename traits::array_layout const& layout) {
+    return map_type::memory_span(layout);
+  }
+
   static constexpr size_t required_allocation_size(
       const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0,
       const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0,
       const size_t arg_N6 = 0, const size_t arg_N7 = 0) {
+    static_assert(traits::array_layout::is_extent_constructible,
+                  "Layout is not constructible from extent arguments. Use "
+                  "overload taking a layout object instead.");
     return map_type::memory_span(typename traits::array_layout(
         arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7));
   }
@@ -1738,17 +1653,11 @@ class View : public ViewTraits<DataType, Properties...> {
       const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
       : View(Impl::ViewCtorProp<pointer_type>(arg_ptr),
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
-                                           arg_N4, arg_N5, arg_N6, arg_N7)) {
-    KOKKOS_IF_ON_HOST(
-        (Impl::runtime_check_rank_host(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());))
-    KOKKOS_IF_ON_DEVICE(
-        (Impl::runtime_check_rank_device(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);))
+                                           arg_N4, arg_N5, arg_N6, arg_N7),
+             check_input_args::yes) {
+    static_assert(traits::array_layout::is_extent_constructible,
+                  "Layout is not constructible from extent arguments. Use "
+                  "overload taking a layout object instead.");
   }
 
   explicit KOKKOS_INLINE_FUNCTION View(
@@ -1758,23 +1667,22 @@ class View : public ViewTraits<DataType, Properties...> {
   //----------------------------------------
   // Shared scratch memory constructor
 
-  static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX,
-                                  const size_t arg_N1 = KOKKOS_INVALID_INDEX,
-                                  const size_t arg_N2 = KOKKOS_INVALID_INDEX,
-                                  const size_t arg_N3 = KOKKOS_INVALID_INDEX,
-                                  const size_t arg_N4 = KOKKOS_INVALID_INDEX,
-                                  const size_t arg_N5 = KOKKOS_INVALID_INDEX,
-                                  const size_t arg_N6 = KOKKOS_INVALID_INDEX,
-                                  const size_t arg_N7 = KOKKOS_INVALID_INDEX) {
-    if (is_layout_stride) {
-      Kokkos::abort(
-          "Kokkos::View::shmem_size(extents...) doesn't work with "
-          "LayoutStride. Pass a LayoutStride object instead");
-    }
+  static KOKKOS_INLINE_FUNCTION size_t
+  shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX,
+             const size_t arg_N1 = KOKKOS_INVALID_INDEX,
+             const size_t arg_N2 = KOKKOS_INVALID_INDEX,
+             const size_t arg_N3 = KOKKOS_INVALID_INDEX,
+             const size_t arg_N4 = KOKKOS_INVALID_INDEX,
+             const size_t arg_N5 = KOKKOS_INVALID_INDEX,
+             const size_t arg_N6 = KOKKOS_INVALID_INDEX,
+             const size_t arg_N7 = KOKKOS_INVALID_INDEX) {
+    static_assert(traits::array_layout::is_extent_constructible,
+                  "Layout is not constructible from extent arguments. Use "
+                  "overload taking a layout object instead.");
     const size_t num_passed_args = Impl::count_valid_integers(
         arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
 
-    if (std::is_same<typename traits::specialize, void>::value &&
+    if (std::is_void<typename traits::specialize>::value &&
         num_passed_args != traits::rank_dynamic) {
       Kokkos::abort(
           "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n");
@@ -1784,8 +1692,8 @@ class View : public ViewTraits<DataType, Properties...> {
         arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7));
   }
 
-  static inline size_t shmem_size(
-      typename traits::array_layout const& arg_layout) {
+  static KOKKOS_INLINE_FUNCTION size_t
+  shmem_size(typename traits::array_layout const& arg_layout) {
     return map_type::memory_span(arg_layout) +
            sizeof(typename traits::value_type);
   }
@@ -1816,17 +1724,11 @@ class View : public ViewTraits<DataType, Properties...> {
                          arg_N7)),
                      sizeof(typename traits::value_type)))),
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
-                                           arg_N4, arg_N5, arg_N6, arg_N7)) {
-    KOKKOS_IF_ON_HOST(
-        (Impl::runtime_check_rank_host(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());))
-    KOKKOS_IF_ON_DEVICE(
-        (Impl::runtime_check_rank_device(
-             traits::rank_dynamic,
-             std::is_same<typename traits::specialize, void>::value, arg_N0,
-             arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);))
+                                           arg_N4, arg_N5, arg_N6, arg_N7),
+             check_input_args::yes) {
+    static_assert(traits::array_layout::is_extent_constructible,
+                  "Layout is not constructible from extent arguments. Use "
+                  "overload taking a layout object instead.");
   }
 };
 
@@ -1852,8 +1754,8 @@ struct RankDataType<ValueType, 0> {
 };
 
 template <unsigned N, typename... Args>
-std::enable_if_t<N == View<Args...>::Rank, View<Args...>> as_view_of_rank_n(
-    View<Args...> v) {
+KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>>
+as_view_of_rank_n(View<Args...> v) {
   return v;
 }
 
@@ -1982,7 +1884,7 @@ struct CommonViewValueType;
 
 template <typename A, typename B>
 struct CommonViewValueType<void, A, B> {
-  using value_type = typename std::common_type<A, B>::type;
+  using value_type = std::common_type_t<A, B>;
 };
 
 template <class Specialize, class ValueType>
@@ -2033,17 +1935,17 @@ struct DeduceCommonViewAllocProp<FirstView, NextViews...> {
   // if first and next specialize differ, but are not the same specialize, error
   // out
   static_assert(!(!std::is_same<first_specialize, next_specialize>::value &&
-                  !std::is_same<first_specialize, void>::value &&
-                  !std::is_same<void, next_specialize>::value),
+                  !std::is_void<first_specialize>::value &&
+                  !std::is_void<next_specialize>::value),
                 "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void "
                 "specialize trait allowed");
 
   // otherwise choose non-void specialize if either/both are non-void
-  using specialize = typename std::conditional<
+  using specialize = std::conditional_t<
       std::is_same<first_specialize, next_specialize>::value, first_specialize,
-      typename std::conditional<(std::is_same<first_specialize, void>::value &&
-                                 !std::is_same<next_specialize, void>::value),
-                                next_specialize, first_specialize>::type>::type;
+      std::conditional_t<(std::is_void<first_specialize>::value &&
+                          !std::is_void<next_specialize>::value),
+                         next_specialize, first_specialize>>;
 
   using value_type = typename CommonViewValueType<specialize, first_value_type,
                                                   next_value_type>::value_type;
@@ -2059,7 +1961,17 @@ template <class... Views>
 using DeducedCommonPropsType =
     typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type;
 
-// User function
+// This function is required in certain scenarios where users customize
+// Kokkos View internals. One example are dynamic length embedded ensemble
+// types. The function is used to propagate necessary information
+// (like the ensemble size) when creating new views.
+// However, most of the time it is called with a single view.
+// Furthermore, the propagated information is not just for view allocations.
+// From what I can tell, the type of functionality provided by
+// common_view_alloc_prop is the equivalent of propagating accessors in mdspan,
+// a mechanism we will eventually use to replace this clunky approach here, when
+// we are finally mdspan based.
+// TODO: get rid of this when we have mdspan
 template <class... Views>
 KOKKOS_INLINE_FUNCTION DeducedCommonPropsType<Views...> common_view_alloc_prop(
     Views const&... views) {
diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
index dbb557c13..fafd825df 100644
--- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_WORKGRAPHPOLICY_HPP
 #define KOKKOS_WORKGRAPHPOLICY_HPP
 
@@ -243,7 +252,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
 }  // namespace Kokkos
 
 #ifdef KOKKOS_ENABLE_SERIAL
-#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
+#include "Serial/Kokkos_Serial_WorkGraphPolicy.hpp"
 #endif
 
 #ifdef KOKKOS_ENABLE_OPENMP
diff --git a/packages/kokkos/core/src/Kokkos_hwloc.hpp b/packages/kokkos/core/src/Kokkos_hwloc.hpp
index 23fa0a0c6..abbec5409 100644
--- a/packages/kokkos/core/src/Kokkos_hwloc.hpp
+++ b/packages/kokkos/core/src/Kokkos_hwloc.hpp
@@ -42,6 +42,15 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
 #ifndef KOKKOS_HWLOC_HPP
 #define KOKKOS_HWLOC_HPP
 
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp
new file mode 100644
index 000000000..f3216095b
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp
@@ -0,0 +1,98 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+
+#include <OpenACC/Kokkos_OpenACC.hpp>
+#include <OpenACC/Kokkos_OpenACC_Instance.hpp>
+#include <impl/Kokkos_Profiling.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
+
+#include <ostream>
+
+Kokkos::Experimental::OpenACC::OpenACC()
+    : m_space_instance(Impl::OpenACCInternal::singleton()) {}
+
+void Kokkos::Experimental::OpenACC::impl_initialize(
+    InitializationSettings const& settings) {
+  Impl::OpenACCInternal::singleton()->initialize(settings);
+}
+
+void Kokkos::Experimental::OpenACC::impl_finalize() {
+  Impl::OpenACCInternal::singleton()->finalize();
+}
+
+bool Kokkos::Experimental::OpenACC::impl_is_initialized() {
+  return Impl::OpenACCInternal::singleton()->is_initialized();
+}
+
+void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os,
+                                                        bool verbose) const {
+  os << "macro KOKKOS_ENABLE_OPENACC is defined\n";  // FIXME_OPENACC
+  m_space_instance->print_configuration(os, verbose);
+}
+
+void Kokkos::Experimental::OpenACC::fence(std::string const& name) const {
+  Impl::OpenACCInternal::singleton()->fence(name);
+}
+
+void Kokkos::Experimental::OpenACC::impl_static_fence(std::string const& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::OpenACC>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() { acc_wait_all(); });
+}
+
+uint32_t Kokkos::Experimental::OpenACC::impl_instance_id() const noexcept {
+  return m_space_instance->instance_id();
+}
+
+namespace Kokkos {
+namespace Impl {
+int g_openacc_space_factory_initialized =
+    initialize_space_factory<Experimental::OpenACC>("170_OpenACC");
+}  // namespace Impl
+}  // Namespace Kokkos
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp
new file mode 100644
index 000000000..3ad59057b
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp
@@ -0,0 +1,126 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
+
+#ifndef KOKKOS_OPENACC_HPP
+#define KOKKOS_OPENACC_HPP
+
+#include <OpenACC/Kokkos_OpenACCSpace.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <OpenACC/Kokkos_OpenACC_Traits.hpp>
+
+#include <openacc.h>
+
+#include <iosfwd>
+#include <string>
+
+namespace Kokkos::Experimental::Impl {
+class OpenACCInternal;
+}
+
+namespace Kokkos::Experimental {
+
+class OpenACC {
+  Impl::OpenACCInternal* m_space_instance = nullptr;
+
+ public:
+  using execution_space = OpenACC;
+  using memory_space    = OpenACCSpace;
+  using device_type     = Kokkos::Device<execution_space, memory_space>;
+
+  using array_layout = LayoutLeft;
+  using size_type    = memory_space::size_type;
+
+  using scratch_memory_space = ScratchMemorySpace<OpenACC>;
+
+  OpenACC();
+
+  static void impl_initialize(InitializationSettings const& settings);
+  static void impl_finalize();
+  static bool impl_is_initialized();
+
+  void print_configuration(std::ostream& os, bool verbose = false) const;
+
+  void fence(std::string const& name =
+                 "Kokkos::OpenACC::fence(): Unnamed Instance Fence") const;
+  static void impl_static_fence(std::string const& name);
+
+  static char const* name() { return "OpenACC"; }
+  static int concurrency() { return 256000; }  // FIXME_OPENACC
+  static bool in_parallel() { return acc_on_device(acc_device_not_host); }
+  uint32_t impl_instance_id() const noexcept;
+};
+
+}  // namespace Kokkos::Experimental
+
+template <>
+struct Kokkos::Tools::Experimental::DeviceTypeTraits<
+    ::Kokkos::Experimental::OpenACC> {
+  static constexpr DeviceType id =
+      ::Kokkos::Profiling::Experimental::DeviceType::OpenACC;
+  // FIXME_OPENACC: Need to return the device id from the execution space
+  // instance. In fact, acc_get_device_num() will return the same value as the
+  // device id from the execution space instance except for the host fallback
+  // case, where the device id may need to be updated with the value of
+  // acc_get_device_num().
+  static int device_id(const Kokkos::Experimental::OpenACC&) {
+    using Kokkos::Experimental::Impl::OpenACC_Traits;
+    return acc_get_device_num(OpenACC_Traits::dev_type);
+  }
+};
+
+#endif
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp
new file mode 100644
index 000000000..bc2ba1815
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp
@@ -0,0 +1,222 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+
+#include <OpenACC/Kokkos_OpenACC.hpp>
+#include <OpenACC/Kokkos_OpenACCSpace.hpp>
+#include <impl/Kokkos_MemorySpace.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#include <openacc.h>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+void *Kokkos::Experimental::OpenACCSpace::allocate(
+    const Kokkos::Experimental::OpenACC &exec_space,
+    const size_t arg_alloc_size) const {
+  return allocate(exec_space, "[unlabeled]", arg_alloc_size);
+}
+
+void *Kokkos::Experimental::OpenACCSpace::allocate(
+    const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+
+void *Kokkos::Experimental::OpenACCSpace::allocate(
+    const Kokkos::Experimental::OpenACC &exec_space, const char *arg_label,
+    const size_t arg_alloc_size, const size_t arg_logical_size) const {
+  return impl_allocate(exec_space, arg_label, arg_alloc_size, arg_logical_size);
+}
+
+void *Kokkos::Experimental::OpenACCSpace::allocate(
+    const char *arg_label, const size_t arg_alloc_size,
+    const size_t arg_logical_size) const {
+  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
+}
+
+void *Kokkos::Experimental::OpenACCSpace::impl_allocate(
+    const Kokkos::Experimental::OpenACC &exec_space, const char *arg_label,
+    const size_t arg_alloc_size, const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  static_assert(sizeof(void *) == sizeof(uintptr_t),
+                "Error sizeof(void*) != sizeof(uintptr_t)");
+
+  void *ptr = nullptr;
+
+  // FIXME_OPENACC multiple device instances are not yet supported, and thus
+  // exec_space is ignored for now.
+  (void)exec_space;
+
+  ptr = acc_malloc(arg_alloc_size);
+
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
+  }
+
+  return ptr;
+}
+
+void *Kokkos::Experimental::OpenACCSpace::impl_allocate(
+    const char *arg_label, const size_t arg_alloc_size,
+    const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  static_assert(sizeof(void *) == sizeof(uintptr_t),
+                "Error sizeof(void*) != sizeof(uintptr_t)");
+
+  void *ptr = nullptr;
+
+  //[DEBUG] Disabled due to the synchronous behavior of the current
+  // implementation.
+  /*
+    OpenACC::impl_static_fence(
+        "Kokkos::OpenACCSpace::impl_allocate: Pre OpenACC Allocation");
+  */
+
+  ptr = acc_malloc(arg_alloc_size);
+
+  //[DEBUG] Disabled due to the synchronous behavior of the current
+  // implementation.
+  /*
+    OpenACC::impl_static_fence(
+        "Kokkos::OpenACCSpace::impl_allocate: Post OpenACC Allocation");
+  */
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
+  }
+
+  return ptr;
+}
+
+void Kokkos::Experimental::OpenACCSpace::deallocate(
+    void *const arg_alloc_ptr, const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void Kokkos::Experimental::OpenACCSpace::deallocate(
+    const char *arg_label, void *const arg_alloc_ptr,
+    const size_t arg_alloc_size, const size_t arg_logical_size) const {
+  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
+}
+
+void Kokkos::Experimental::OpenACCSpace::impl_deallocate(
+    const char *arg_label, void *const arg_alloc_ptr,
+    const size_t arg_alloc_size, const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
+                                      reported_size);
+  }
+
+  if (arg_alloc_ptr) {
+    acc_free(arg_alloc_ptr);
+  }
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#ifdef KOKKOS_ENABLE_DEBUG
+Kokkos::Impl::SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::OpenACCSpace, void>::s_root_record;
+#endif
+
+Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
+                                     void>::~SharedAllocationRecord() {
+  m_space.deallocate(m_label.c_str(),
+                     SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     (SharedAllocationRecord<void, void>::m_alloc_size -
+                      sizeof(SharedAllocationHeader)));
+}
+
+Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::OpenACCSpace &arg_space,
+        const std::string &arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+          arg_label),
+      m_space(arg_space) {
+  SharedAllocationHeader header;
+
+  this->base_t::_fill_host_accessible_header_info(header, arg_label);
+
+  Kokkos::Impl::DeepCopy<Experimental::OpenACCSpace, HostSpace>(
+      RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader));
+  Kokkos::fence(
+      "SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, "
+      "void>::SharedAllocationRecord(): fence after copying header from "
+      "HostSpace");
+}
+
+//==============================================================================
+// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
+
+#include <impl/Kokkos_SharedAlloc_timpl.hpp>
+
+// To avoid additional compilation cost for something that's (mostly?) not
+// performance sensitive, we explicitly instantiate these CRTP base classes
+// here, where we have access to the associated *_timpl.hpp header files.
+template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon<
+    Kokkos::Experimental::OpenACCSpace>;
+template class Kokkos::Impl::SharedAllocationRecordCommon<
+    Kokkos::Experimental::OpenACCSpace>;
+
+// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
+//==============================================================================
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp
new file mode 100644
index 000000000..a7347e8f9
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp
@@ -0,0 +1,249 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#include <Kokkos_Macros.hpp>
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+static_assert(false,
+              "Including non-public Kokkos header files is not allowed.");
+#else
+KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
+#endif
+#endif
+
+#ifndef KOKKOS_OPENACC_SPACE_HPP
+#define KOKKOS_OPENACC_SPACE_HPP
+
+#include <Kokkos_Concepts.hpp>
+
+#include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+
+#include <openacc.h>
+#include <iosfwd>
+
+namespace Kokkos::Experimental {
+
+class OpenACC;
+
+class OpenACCSpace {
+ public:
+  using memory_space    = OpenACCSpace;
+  using execution_space = OpenACC;
+  using device_type     = Kokkos::Device<execution_space, memory_space>;
+
+  using size_type = size_t;
+
+  OpenACCSpace() = default;
+
+  /**\brief  Allocate untracked memory in the space */
+  void* allocate(const Kokkos::Experimental::OpenACC& exec_space,
+                 const size_t arg_alloc_size) const;
+  void* allocate(const Kokkos::Experimental::OpenACC& exec_space,
+                 const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
+  void* allocate(const size_t arg_alloc_size) const;
+  void* allocate(const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
+  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                  const size_t arg_alloc_size,
+                  const size_t arg_logical_size = 0) const;
+
+  static constexpr char const* name() { return "OpenACCSpace"; }
+
+ private:
+  void* impl_allocate(const Kokkos::Experimental::OpenACC& exec_space,
+                      const char* arg_label, const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
+  void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
+                      const size_t arg_logical_size = 0,
+                      const Kokkos::Tools::SpaceHandle =
+                          Kokkos::Tools::make_space_handle(name())) const;
+  void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                       const size_t arg_alloc_size,
+                       const size_t arg_logical_size = 0,
+                       const Kokkos::Tools::SpaceHandle =
+                           Kokkos::Tools::make_space_handle(name())) const;
+};
+
+}  // namespace Kokkos::Experimental
+
+/*--------------------------------------------------------------------------*/
+
+template <>
+struct Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
+                                       Kokkos::Experimental::OpenACCSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
+                                       Kokkos::HostSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
+                                       Kokkos::Experimental::OpenACCSpace> {
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+/*--------------------------------------------------------------------------*/
+
+template <>
+class Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
+                                           void>
+    : public HostInaccessibleSharedAllocationRecordCommon<
+          Kokkos::Experimental::OpenACCSpace> {
+ private:
+  friend class HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::OpenACCSpace>;
+  friend class SharedAllocationRecordCommon<Kokkos::Experimental::OpenACCSpace>;
+  friend Kokkos::Experimental::OpenACCSpace;
+
+  using base_t = HostInaccessibleSharedAllocationRecordCommon<
+      Kokkos::Experimental::OpenACCSpace>;
+  using RecordBase = SharedAllocationRecord<void, void>;
+
+  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
+
+  /**\brief  Root record for tracked allocations from this OpenACCSpace
+   * instance */
+  static RecordBase s_root_record;
+
+  const Kokkos::Experimental::OpenACCSpace m_space;
+
+ protected:
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default;
+
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(
+      const ExecutionSpace& /*exec_space*/,
+      const Kokkos::Experimental::OpenACCSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &deallocate)
+      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
+                               arg_dealloc) {}
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::OpenACCSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &deallocate);
+
+ public:
+  KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate(
+      const Kokkos::Experimental::OpenACCSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size) {
+    if (acc_on_device(acc_device_host)) {
+      return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size);
+    } else {
+      return nullptr;
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// FIXME_OPENACC: Need to update the DeepCopy implementations below to support
+// multiple execution space instances.
+// The current OpenACC backend implementation assumes that there is only one
+// device execution space instance, and all the device operations (e.g., memory
+// transfers, kernel launches, etc.) are implemented to be synchronous, which
+// does not violate the Kokkos execution semantics with the single execution
+// space instance.
+template <class ExecutionSpace>
+struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace,
+                              Kokkos::Experimental::OpenACCSpace,
+                              ExecutionSpace> {
+  DeepCopy(void* dst, const void* src, size_t n) {
+    // The behavior of acc_memcpy_device when bytes argument is zero is
+    // clarified only in the latest OpenACC specification (V3.2), and thus the
+    // value checking is added as a safeguard. (The current NVHPC (V22.5)
+    // supports OpenACC V2.7.)
+    if (n > 0) acc_memcpy_device(dst, const_cast<void*>(src), n);
+  }
+  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
+    exec.fence();
+    if (n > 0) acc_memcpy_device(dst, const_cast<void*>(src), n);
+  }
+};
+
+template <class ExecutionSpace>
+struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace,
+                              Kokkos::HostSpace, ExecutionSpace> {
+  DeepCopy(void* dst, const void* src, size_t n) {
+    if (n > 0) acc_memcpy_to_device(dst, const_cast<void*>(src), n);
+  }
+  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
+    exec.fence();
+    if (n > 0) acc_memcpy_to_device(dst, const_cast<void*>(src), n);
+  }
+};
+
+template <class ExecutionSpace>
+struct Kokkos::Impl::DeepCopy<
+    Kokkos::HostSpace, Kokkos::Experimental::OpenACCSpace, ExecutionSpace> {
+  DeepCopy(void* dst, const void* src, size_t n) {
+    if (n > 0) acc_memcpy_from_device(dst, const_cast<void*>(src), n);
+  }
+  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
+    exec.fence();
+    if (n > 0) acc_memcpy_from_device(dst, const_cast<void*>(src), n);
+  }
+};
+
+#endif
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp
new file mode 100644
index 000000000..15d38803f
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+
+#include <OpenACC/Kokkos_OpenACC_Instance.hpp>
+#include <OpenACC/Kokkos_OpenACC.hpp>
+#include <OpenACC/Kokkos_OpenACC_Traits.hpp>
+#include <impl/Kokkos_Profiling.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>
+
+#include <openacc.h>
+
+#include <iostream>
+
+namespace Kokkos {
+bool show_warnings() noexcept;
+}
+
+Kokkos::Experimental::Impl::OpenACCInternal*
+Kokkos::Experimental::Impl::OpenACCInternal::singleton() {
+  static OpenACCInternal self;
+  return &self;
+}
+
+void Kokkos::Experimental::Impl::OpenACCInternal::initialize(
+    InitializationSettings const& settings) {
+  if (OpenACC_Traits::may_fallback_to_host &&
+      acc_get_num_devices(OpenACC_Traits::dev_type) == 0 &&
+      !settings.has_device_id()) {
+    if (show_warnings()) {
+      std::cerr << "Warning: No GPU available for execution, falling back to"
+                   " using the host!"
+                << std::endl;
+    }
+    acc_set_device_type(acc_device_host);
+    // FIXME_OPENACC if multiple execution space instances are supported,
+    // device id variable should be explicitly set to the value returned by
+    // acc_get_device_num(acc_device_host).
+  } else {
+    using Kokkos::Impl::get_gpu;
+    int const dev_num = get_gpu(settings);
+    acc_set_device_num(dev_num, OpenACC_Traits::dev_type);
+  }
+  m_is_initialized = true;
+}
+
+void Kokkos::Experimental::Impl::OpenACCInternal::finalize() {
+  m_is_initialized = false;
+}
+
+bool Kokkos::Experimental::Impl::OpenACCInternal::is_initialized() const {
+  return m_is_initialized;
+}
+
+void Kokkos::Experimental::Impl::OpenACCInternal::print_configuration(
+    std::ostream& os, bool /*verbose*/) const {
+  os << "Using OpenACC\n";  // FIXME_OPENACC
+}
+
+void Kokkos::Experimental::Impl::OpenACCInternal::fence(
+    std::string const& name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::OpenACC>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id()},
+      [&]() {
+        //[DEBUG] disabled due to synchronous behaviors of the current
+        // parallel construct implementations. acc_wait_all();
+      });
+}
+
+uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() const
+    noexcept {
+  return Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::OpenACC>(reinterpret_cast<uintptr_t>(this));
+}
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp
new file mode 100644
index 000000000..cb69b4ae7
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp
@@ -0,0 +1,79 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENACC_INSTANCE_HPP
+#define KOKKOS_OPENACC_INSTANCE_HPP
+
+#include <impl/Kokkos_InitializationSettings.hpp>
+
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+namespace Kokkos::Experimental::Impl {
+
+class OpenACCInternal {
+  bool m_is_initialized = false;
+
+  OpenACCInternal()                       = default;
+  OpenACCInternal(const OpenACCInternal&) = default;
+  OpenACCInternal& operator=(const OpenACCInternal&) = default;
+
+ public:
+  static OpenACCInternal* singleton();
+
+  void initialize(InitializationSettings const& settings);
+  void finalize();
+  bool is_initialized() const;
+
+  void print_configuration(std::ostream& os, bool verbose = false) const;
+
+  void fence(std::string const& name) const;
+
+  uint32_t instance_id() const noexcept;
+};
+
+}  // namespace Kokkos::Experimental::Impl
+
+#endif
diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
new file mode 100644
index 000000000..f9451ecfe
--- /dev/null
+++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENACC_TRAITS_HPP
+#define KOKKOS_OPENACC_TRAITS_HPP
+
+#include <openacc.h>
+
+namespace Kokkos::Experimental::Impl {
+
+struct OpenACC_Traits {
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+    defined(KOKKOS_ARCH_AMPERE)
+  static constexpr acc_device_t dev_type     = acc_device_nvidia;
+  static constexpr bool may_fallback_to_host = false;
+#else
+  static constexpr acc_device_t dev_type     = acc_device_not_host;
+  static constexpr bool may_fallback_to_host = true;
+#endif
+};
+
+}  // namespace Kokkos::Experimental::Impl
+
+#endif
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
similarity index 72%
rename from packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
rename to packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
index 66dbbacce..2397aa478 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
@@ -42,8 +42,9 @@
 //@HEADER
 */
 
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_OPENMP)
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
 
 #include <cstdio>
 #include <cstdlib>
@@ -57,19 +58,21 @@
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_CPUDiscovery.hpp>
 #include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
 
 namespace Kokkos {
 namespace Impl {
 
 int g_openmp_hardware_max_threads = 1;
 
-__thread int t_openmp_hardware_id            = 0;
-__thread Impl::OpenMPExec *t_openmp_instance = nullptr;
+thread_local int t_openmp_hardware_id = 0;
+// FIXME_OPENMP we can remove this after we remove partition_master
+thread_local OpenMPInternal *t_openmp_instance = nullptr;
 
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-void OpenMPExec::validate_partition_impl(const int nthreads,
-                                         int &num_partitions,
-                                         int &partition_size) {
+void OpenMPInternal::validate_partition_impl(const int nthreads,
+                                             int &num_partitions,
+                                             int &partition_size) {
   if (nthreads == 1) {
     num_partitions = 1;
     partition_size = 1;
@@ -121,24 +124,7 @@ void OpenMPExec::validate_partition_impl(const int nthreads,
 }
 #endif
 
-void OpenMPExec::verify_is_master(const char *const label) {
-  if (!t_openmp_instance) {
-    std::string msg(label);
-    msg.append(" ERROR: in parallel or not initialized");
-    Kokkos::Impl::throw_runtime_exception(msg);
-  }
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-void OpenMPExec::clear_thread_data() {
+void OpenMPInternal::clear_thread_data() {
   const size_t member_bytes =
       sizeof(int64_t) *
       HostThreadTeamData::align_to_int64(sizeof(HostThreadTeamData));
@@ -163,10 +149,10 @@ void OpenMPExec::clear_thread_data() {
   /* END #pragma omp parallel */
 }
 
-void OpenMPExec::resize_thread_data(size_t pool_reduce_bytes,
-                                    size_t team_reduce_bytes,
-                                    size_t team_shared_bytes,
-                                    size_t thread_local_bytes) {
+void OpenMPInternal::resize_thread_data(size_t pool_reduce_bytes,
+                                        size_t team_reduce_bytes,
+                                        size_t team_shared_bytes,
+                                        size_t thread_local_bytes) {
   const size_t member_bytes =
       sizeof(int64_t) *
       HostThreadTeamData::align_to_int64(sizeof(HostThreadTeamData));
@@ -243,17 +229,16 @@ void OpenMPExec::resize_thread_data(size_t pool_reduce_bytes,
   }
 }
 
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
+OpenMPInternal &OpenMPInternal::singleton() {
+  static OpenMPInternal *self = nullptr;
+  if (self == nullptr) {
+    self = new OpenMPInternal(get_current_max_threads());
+  }
 
-//----------------------------------------------------------------------------
+  return *self;
+}
 
-int OpenMP::impl_get_current_max_threads() noexcept {
+int OpenMPInternal::get_current_max_threads() noexcept {
   // Using omp_get_max_threads(); is problematic in conjunction with
   // Hwloc on Intel (essentially an initial call to the OpenMP runtime
   // without a parallel region before will set a process mask for a single core
@@ -272,16 +257,17 @@ int OpenMP::impl_get_current_max_threads() noexcept {
   return count;
 }
 
-void OpenMP::impl_initialize(int thread_count) {
+void OpenMPInternal::initialize(int thread_count) {
+  if (m_initialized) {
+    Kokkos::abort(
+        "Calling OpenMP::initialize after OpenMP::finalize is illegal\n");
+  }
+
   if (omp_in_parallel()) {
     std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel");
     Kokkos::Impl::throw_runtime_exception(msg);
   }
 
-  if (Impl::t_openmp_instance) {
-    finalize();
-  }
-
   {
     if (Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND")) {
       printf(
@@ -299,7 +285,7 @@ void OpenMP::impl_initialize(int thread_count) {
     // Before any other call to OMP query the maximum number of threads
     // and save the value for re-initialization unit testing.
 
-    Impl::g_openmp_hardware_max_threads = impl_get_current_max_threads();
+    Impl::g_openmp_hardware_max_threads = get_current_max_threads();
 
     int process_num_threads = Impl::g_openmp_hardware_max_threads;
 
@@ -335,21 +321,12 @@ void OpenMP::impl_initialize(int thread_count) {
 // setup thread local
 #pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
     {
-      Impl::t_openmp_instance    = nullptr;
       Impl::t_openmp_hardware_id = omp_get_thread_num();
       Impl::SharedAllocationRecord<void, void>::tracking_enable();
     }
 
-    void *ptr = nullptr;
-    try {
-      ptr = space.allocate(sizeof(Impl::OpenMPExec));
-    } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &f) {
-      // For now, just rethrow the error message the existing way
-      Kokkos::Impl::throw_runtime_exception(f.get_error_message());
-    }
-
-    Impl::t_openmp_instance =
-        new (ptr) Impl::OpenMPExec(Impl::g_openmp_hardware_max_threads);
+    auto &instance       = OpenMPInternal::singleton();
+    instance.m_pool_size = Impl::g_openmp_hardware_max_threads;
 
     // New, unified host thread team data:
     {
@@ -358,9 +335,8 @@ void OpenMP::impl_initialize(int thread_count) {
       size_t team_shared_bytes  = 1024 * thread_count;
       size_t thread_local_bytes = 1024;
 
-      Impl::t_openmp_instance->resize_thread_data(
-          pool_reduce_bytes, team_reduce_bytes, team_shared_bytes,
-          thread_local_bytes);
+      instance.resize_thread_data(pool_reduce_bytes, team_reduce_bytes,
+                                  team_shared_bytes, thread_local_bytes);
     }
   }
 
@@ -380,38 +356,31 @@ void OpenMP::impl_initialize(int thread_count) {
               << thread_count << " threads per process." << std::endl;
   }
   // Init the array for used for arbitrarily sized atomics
-  Impl::init_lock_array_host_space();
-}
+  init_lock_array_host_space();
 
-//----------------------------------------------------------------------------
+  m_initialized = true;
+}
 
-void OpenMP::impl_finalize() {
+void OpenMPInternal::finalize() {
   if (omp_in_parallel()) {
     std::string msg("Kokkos::OpenMP::finalize ERROR ");
-    if (!Impl::t_openmp_instance) msg.append(": not initialized");
+    if (this != &singleton()) msg.append(": not initialized");
     if (omp_in_parallel()) msg.append(": in parallel");
     Kokkos::Impl::throw_runtime_exception(msg);
   }
 
-  if (Impl::t_openmp_instance) {
+  if (this == &singleton()) {
+    auto const &instance = singleton();
     // Silence Cuda Warning
-    const int nthreads = Impl::t_openmp_instance->m_pool_size <=
-                                 Impl::g_openmp_hardware_max_threads
-                             ? Impl::g_openmp_hardware_max_threads
-                             : Impl::t_openmp_instance->m_pool_size;
+    const int nthreads =
+        instance.m_pool_size <= Impl::g_openmp_hardware_max_threads
+            ? Impl::g_openmp_hardware_max_threads
+            : instance.m_pool_size;
     (void)nthreads;
 
-    using Exec     = Impl::OpenMPExec;
-    Exec *instance = Impl::t_openmp_instance;
-    instance->~Exec();
-
-    OpenMP::memory_space space;
-    space.deallocate(instance, sizeof(Exec));
-
 #pragma omp parallel num_threads(nthreads)
     {
       Impl::t_openmp_hardware_id = 0;
-      Impl::t_openmp_instance    = nullptr;
       Impl::SharedAllocationRecord<void, void>::tracking_disable();
     }
 
@@ -421,19 +390,15 @@ void OpenMP::impl_finalize() {
     Impl::g_openmp_hardware_max_threads = 1;
   }
 
+  m_initialized = false;
+
   Kokkos::Profiling::finalize();
 }
 
-//----------------------------------------------------------------------------
-
-void OpenMP::print_configuration(std::ostream &s, const bool /*verbose*/) {
+void OpenMPInternal::print_configuration(std::ostream &s) const {
   s << "Kokkos::OpenMP";
 
-  const bool is_initialized = Impl::t_openmp_instance != nullptr;
-
-  if (is_initialized) {
-    Impl::OpenMPExec::verify_is_master("OpenMP::print_configuration");
-
+  if (m_initialized) {
     const int numa_count      = 1;
     const int core_per_numa   = Impl::g_openmp_hardware_max_threads;
     const int thread_per_core = 1;
@@ -445,67 +410,75 @@ void OpenMP::print_configuration(std::ostream &s, const bool /*verbose*/) {
   }
 }
 
-std::vector<OpenMP> OpenMP::partition(...) { return std::vector<OpenMP>(1); }
-
-OpenMP OpenMP::create_instance(...) { return OpenMP(); }
+bool OpenMPInternal::verify_is_initialized(const char *const label) const {
+  if (!m_initialized) {
+    std::cerr << "Kokkos::OpenMP " << label
+              << " : ERROR OpenMP is not initialized" << std::endl;
+  }
+  return m_initialized;
+}
+}  // namespace Impl
 
-int OpenMP::concurrency() { return Impl::g_openmp_hardware_max_threads; }
+//----------------------------------------------------------------------------
 
-void OpenMP::fence() const {
-  fence("Kokkos::OpenMP::fence: Unnamed Instance Fence");
+OpenMP::OpenMP()
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    : m_space_instance(&Impl::OpenMPInternal::singleton()) {
 }
-void OpenMP::fence(const std::string &name) const {
-  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
-      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {});
+#else
+    : m_space_instance(&Impl::OpenMPInternal::singleton(),
+                       [](Impl::OpenMPInternal *) {}) {
+  Impl::OpenMPInternal::singleton().verify_is_initialized(
+      "OpenMP instance constructor");
 }
+#endif
 
-namespace Impl {
-
-int g_openmp_space_factory_initialized =
-    initialize_space_factory<OpenMPSpaceInitializer>("050_OpenMP");
-
-void OpenMPSpaceInitializer::initialize(const InitArguments &args) {
-  // Prevent "unused variable" warning for 'args' input struct.  If
-  // Serial::initialize() ever needs to take arguments from the input
-  // struct, you may remove this line of code.
-  const int num_threads = args.num_threads;
-
-  if (std::is_same<Kokkos::OpenMP, Kokkos::DefaultExecutionSpace>::value ||
-      std::is_same<Kokkos::OpenMP, Kokkos::HostSpace::execution_space>::value) {
-    Kokkos::OpenMP::impl_initialize(num_threads);
-  } else {
-    // std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not
-    // initialized" << std::endl ;
-  }
+int OpenMP::impl_get_current_max_threads() noexcept {
+  return Impl::OpenMPInternal::get_current_max_threads();
 }
 
-void OpenMPSpaceInitializer::finalize(const bool) {
-  if (Kokkos::OpenMP::impl_is_initialized()) Kokkos::OpenMP::impl_finalize();
+void OpenMP::impl_initialize(InitializationSettings const &settings) {
+  Impl::OpenMPInternal::singleton().initialize(
+      settings.has_num_threads() ? settings.get_num_threads() : -1);
 }
 
-void OpenMPSpaceInitializer::fence() { Kokkos::OpenMP::impl_static_fence(); }
-void OpenMPSpaceInitializer::fence(const std::string &name) {
-  Kokkos::OpenMP::impl_static_fence(OpenMP(), name);
-}
+void OpenMP::impl_finalize() { Impl::OpenMPInternal::singleton().finalize(); }
 
-void OpenMPSpaceInitializer::print_configuration(std::ostream &msg,
-                                                 const bool detail) {
-  msg << "Host Parallel Execution Space:" << std::endl;
-  msg << "  KOKKOS_ENABLE_OPENMP: ";
-  msg << "yes" << std::endl;
+void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const {
+  os << "Host Parallel Execution Space:\n";
+  os << "  KOKKOS_ENABLE_OPENMP: yes\n";
 
-  msg << "OpenMP Atomics:" << std::endl;
-  msg << "  KOKKOS_ENABLE_OPENMP_ATOMICS: ";
+  os << "OpenMP Atomics:\n";
+  os << "  KOKKOS_ENABLE_OPENMP_ATOMICS: ";
 #ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
-  msg << "yes" << std::endl;
+  os << "yes\n";
 #else
-  msg << "no" << std::endl;
+  os << "no\n";
 #endif
 
-  msg << "\nOpenMP Runtime Configuration:" << std::endl;
-  OpenMP::print_configuration(msg, detail);
+  os << "\nOpenMP Runtime Configuration:\n";
+
+  m_space_instance->print_configuration(os);
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+std::vector<OpenMP> OpenMP::partition(...) { return std::vector<OpenMP>(1); }
+
+OpenMP OpenMP::create_instance(...) { return OpenMP(); }
+#endif
+
+int OpenMP::concurrency() { return Impl::g_openmp_hardware_max_threads; }
+
+void OpenMP::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {});
+}
+
+namespace Impl {
+
+int g_openmp_space_factory_initialized =
+    initialize_space_factory<OpenMP>("050_OpenMP");
+
 }  // namespace Impl
 
 #ifdef KOKKOS_ENABLE_CXX14
@@ -517,7 +490,3 @@ constexpr DeviceType DeviceTypeTraits<OpenMP>::id;
 #endif
 
 }  // namespace Kokkos
-
-#else
-void KOKKOS_CORE_SRC_OPENMP_EXEC_PREVENT_LINK_ERROR() {}
-#endif  // KOKKOS_ENABLE_OPENMP
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
similarity index 80%
rename from packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
rename to packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
index ede24d109..1a2ee95a7 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
@@ -42,12 +42,10 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_OPENMPEXEC_HPP
-#define KOKKOS_OPENMPEXEC_HPP
+#ifndef KOKKOS_OPENMP_INSTANCE_HPP
+#define KOKKOS_OPENMP_INSTANCE_HPP
 
 #include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_OPENMP)
-
 #if !defined(_OPENMP) && !defined(__CUDA_ARCH__) && \
     !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
 #error \
@@ -66,27 +64,45 @@
 
 #include <omp.h>
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 namespace Kokkos {
 namespace Impl {
 
-class OpenMPExec;
+class OpenMPInternal;
 
 extern int g_openmp_hardware_max_threads;
 
-extern __thread int t_openmp_hardware_id;
-extern __thread OpenMPExec* t_openmp_instance;
+extern thread_local int t_openmp_hardware_id;
+// FIXME_OPENMP we can remove this after we remove partition_master
+extern thread_local OpenMPInternal* t_openmp_instance;
+
+struct OpenMPTraits {
+  static int constexpr MAX_THREAD_COUNT = 512;
+};
+
+class OpenMPInternal {
+ private:
+  OpenMPInternal(int arg_pool_size)
+      : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() {}
+
+  ~OpenMPInternal() { clear_thread_data(); }
+
+  static int get_current_max_threads() noexcept;
 
-//----------------------------------------------------------------------------
-/** \brief  Data for OpenMP thread execution */
+  bool m_initialized = false;
+
+  int m_pool_size;
+  int m_level;
+
+  HostThreadTeamData* m_pool[OpenMPTraits::MAX_THREAD_COUNT];
 
-class OpenMPExec {
  public:
   friend class Kokkos::OpenMP;
 
-  enum { MAX_THREAD_COUNT = 512 };
+  static OpenMPInternal& singleton();
+
+  void initialize(int thread_cound);
+
+  void finalize();
 
   void clear_thread_data();
 
@@ -100,65 +116,58 @@ class OpenMPExec {
                                       int& partition_size);
 #endif
 
- private:
-  OpenMPExec(int arg_pool_size)
-      : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() {}
-
-  ~OpenMPExec() { clear_thread_data(); }
-
-  int m_pool_size;
-  int m_level;
-
-  HostThreadTeamData* m_pool[MAX_THREAD_COUNT];
-
- public:
-  static void verify_is_master(const char* const);
-
   void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes,
                           size_t team_shared_bytes, size_t thread_local_bytes);
 
-  inline HostThreadTeamData* get_thread_data() const noexcept {
+  HostThreadTeamData* get_thread_data() const noexcept {
     return m_pool[m_level == omp_get_level() ? 0 : omp_get_thread_num()];
   }
 
-  inline HostThreadTeamData* get_thread_data(int i) const noexcept {
+  HostThreadTeamData* get_thread_data(int i) const noexcept {
     return m_pool[i];
   }
-};
 
-}  // namespace Impl
-}  // namespace Kokkos
+  bool is_initialized() const { return m_initialized; }
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+  bool verify_is_initialized(const char* const label) const;
 
-namespace Kokkos {
+  void print_configuration(std::ostream& s) const;
+};
 
+}  // namespace Impl
 inline bool OpenMP::impl_is_initialized() noexcept {
-  return Impl::t_openmp_instance != nullptr;
+  return Impl::OpenMPInternal::singleton().is_initialized();
 }
 
 inline bool OpenMP::in_parallel(OpenMP const&) noexcept {
-  // t_openmp_instance is only non-null on a master thread
-  return !Impl::t_openmp_instance ||
-         Impl::t_openmp_instance->m_level < omp_get_level();
+  // FIXME_OPENMP We are forced to use t_openmp_instance because the function is
+  // static and does not use the OpenMP object
+  return ((Impl::OpenMPInternal::singleton().m_level < omp_get_level()) &&
+          (!Impl::t_openmp_instance ||
+           Impl::t_openmp_instance->m_level < omp_get_level()));
 }
 
 inline int OpenMP::impl_thread_pool_size() noexcept {
-  return OpenMP::in_parallel() ? omp_get_num_threads()
-                               : Impl::t_openmp_instance->m_pool_size;
+  // FIXME_OPENMP We are forced to use t_openmp_instance because the function is
+  // static
+  return OpenMP::in_parallel()
+             ? omp_get_num_threads()
+             : (Impl::t_openmp_instance
+                    ? Impl::t_openmp_instance->m_pool_size
+                    : Impl::OpenMPInternal::singleton().m_pool_size);
 }
 
 KOKKOS_INLINE_FUNCTION
 int OpenMP::impl_thread_pool_rank() noexcept {
+  // FIXME_OPENMP We are forced to use t_openmp_instance because the function is
+  // static
   KOKKOS_IF_ON_HOST(
       (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();))
 
   KOKKOS_IF_ON_DEVICE((return -1;))
 }
 
-inline void OpenMP::impl_static_fence(OpenMP const& /**instance*/,
-                                      const std::string& name) noexcept {
+inline void OpenMP::impl_static_fence(std::string const& name) {
   Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
       name,
       Kokkos::Tools::Experimental::SpecialSynchronizationCases::
@@ -179,9 +188,9 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
 #else
   if (omp_get_nested()) {
 #endif
-    using Exec = Impl::OpenMPExec;
+    using Exec = Impl::OpenMPInternal;
 
-    Exec* prev_instance = Impl::t_openmp_instance;
+    Exec* prev_instance = &Impl::OpenMPInternal::singleton();
 
     Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
                                   partition_size);
@@ -190,35 +199,22 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
 
 #pragma omp parallel num_threads(num_partitions)
     {
-      void* ptr = nullptr;
-      try {
-        ptr = space.allocate(sizeof(Exec));
-      } catch (
-          Kokkos::Experimental::RawMemoryAllocationFailure const& failure) {
-        // For now, just rethrow the error message the existing way
-        Kokkos::Impl::throw_runtime_exception(failure.get_error_message());
-      }
-
-      Impl::t_openmp_instance = new (ptr) Exec(partition_size);
+      Exec thread_local_instance(partition_size);
+      Impl::t_openmp_instance = &thread_local_instance;
 
       size_t pool_reduce_bytes  = 32 * partition_size;
       size_t team_reduce_bytes  = 32 * partition_size;
       size_t team_shared_bytes  = 1024 * partition_size;
       size_t thread_local_bytes = 1024;
 
-      Impl::t_openmp_instance->resize_thread_data(
+      thread_local_instance.resize_thread_data(
           pool_reduce_bytes, team_reduce_bytes, team_shared_bytes,
           thread_local_bytes);
 
       omp_set_num_threads(partition_size);
       f(omp_get_thread_num(), omp_get_num_threads());
-
-      Impl::t_openmp_instance->~Exec();
-      space.deallocate(Impl::t_openmp_instance, sizeof(Exec));
       Impl::t_openmp_instance = nullptr;
     }
-
-    Impl::t_openmp_instance = prev_instance;
   } else {
     // nested openmp not enabled
     f(0, 1);
@@ -368,4 +364,3 @@ inline int OpenMP::impl_max_hardware_threads() noexcept {
 }  // namespace Kokkos
 
 #endif
-#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
index 764dd9065..94c465dc2 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -49,14 +49,26 @@
 #if defined(KOKKOS_ENABLE_OPENMP)
 
 #include <omp.h>
-#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
 
 #include <KokkosExp_MDRangePolicy.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#define KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#undef KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+#define KOKKOS_PRAGMA_IVDEP_IF_ENABLED _Pragma("ivdep")
+#endif
+
+#ifndef KOKKOS_COMPILER_NVHPC
+#define KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE , m_policy.chunk_size()
+#else
+#define KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -68,84 +80,105 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::OpenMP> {
   using WorkRange = typename Policy::WorkRange;
   using Member    = typename Policy::member_type;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const Policy m_policy;
 
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend) {
-#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-#endif
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(iwork);
+  inline static void exec_range(const FunctorType& functor, const Member ibeg,
+                                const Member iend) {
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = ibeg; iwork < iend; ++iwork) {
+      exec_work(functor, iwork);
     }
   }
 
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend) {
-    const TagType t{};
-#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-#endif
-    for (Member iwork = ibeg; iwork < iend; ++iwork) {
-      functor(t, iwork);
+  template <class Enable = WorkTag>
+  inline static std::enable_if_t<std::is_void<WorkTag>::value &&
+                                 std::is_same<Enable, WorkTag>::value>
+  exec_work(const FunctorType& functor, const Member iwork) {
+    functor(iwork);
+  }
+
+  template <class Enable = WorkTag>
+  inline static std::enable_if_t<!std::is_void<WorkTag>::value &&
+                                 std::is_same<Enable, WorkTag>::value>
+  exec_work(const FunctorType& functor, const Member iwork) {
+    functor(WorkTag{}, iwork);
+  }
+
+  template <class Policy>
+  std::enable_if_t<std::is_same<typename Policy::schedule_type::type,
+                                Kokkos::Dynamic>::value>
+  execute_parallel() const {
+    // prevent bug in NVHPC 21.9/CUDA 11.4 (entering zero iterations loop)
+    if (m_policy.begin() >= m_policy.end()) return;
+#pragma omp parallel for schedule(dynamic KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
+    num_threads(OpenMP::impl_thread_pool_size())
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
+      exec_work(m_functor, iwork);
+    }
+  }
+
+  template <class Policy>
+  std::enable_if_t<!std::is_same<typename Policy::schedule_type::type,
+                                 Kokkos::Dynamic>::value>
+  execute_parallel() const {
+#pragma omp parallel for schedule(static KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
+    num_threads(OpenMP::impl_thread_pool_size())
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
+      exec_work(m_functor, iwork);
     }
   }
 
  public:
   inline void execute() const {
-    enum {
-      is_dynamic = std::is_same<typename Policy::schedule_type::type,
-                                Kokkos::Dynamic>::value
-    };
-
     if (OpenMP::in_parallel()) {
-      exec_range<WorkTag>(m_functor, m_policy.begin(), m_policy.end());
-    } else {
-      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
+      exec_range(m_functor, m_policy.begin(), m_policy.end());
+      return;
+    }
 
+#ifndef KOKKOS_INTERNAL_DISABLE_NATIVE_OPENMP
+    execute_parallel<Policy>();
+#else
+    constexpr bool is_dynamic =
+        std::is_same<typename Policy::schedule_type::type,
+                     Kokkos::Dynamic>::value;
 #pragma omp parallel num_threads(OpenMP::impl_thread_pool_size())
-      {
-        HostThreadTeamData& data = *(m_instance->get_thread_data());
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
 
-        data.set_work_partition(m_policy.end() - m_policy.begin(),
-                                m_policy.chunk_size());
+      data.set_work_partition(m_policy.end() - m_policy.begin(),
+                              m_policy.chunk_size());
 
-        if (is_dynamic) {
-          // Make sure work partition is set before stealing
-          if (data.pool_rendezvous()) data.pool_rendezvous_release();
-        }
+      if (is_dynamic) {
+        // Make sure work partition is set before stealing
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
 
-        std::pair<int64_t, int64_t> range(0, 0);
+      std::pair<int64_t, int64_t> range(0, 0);
 
-        do {
-          range = is_dynamic ? data.get_work_stealing_chunk()
-                             : data.get_work_partition();
+      do {
+        range = is_dynamic ? data.get_work_stealing_chunk()
+                           : data.get_work_partition();
 
-          ParallelFor::template exec_range<WorkTag>(
-              m_functor, range.first + m_policy.begin(),
-              range.second + m_policy.begin());
+        exec_range(m_functor, range.first + m_policy.begin(),
+                   range.second + m_policy.begin());
 
-        } while (is_dynamic && 0 <= range.first);
-      }
+      } while (is_dynamic && 0 <= range.first);
     }
+#endif
   }
 
   inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
-      : m_instance(t_openmp_instance),
-        m_functor(arg_functor),
-        m_policy(arg_policy) {}
+      : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+  }
 };
 
 // MDRangePolicy impl
@@ -163,7 +196,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   using iterate_type = typename Kokkos::Impl::HostIterateTile<
       MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const MDRangePolicy m_mdr_policy;
   const Policy m_policy;  // construct as RangePolicy( 0, num_tiles
@@ -172,62 +205,90 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   inline static void exec_range(const MDRangePolicy& mdr_policy,
                                 const FunctorType& functor, const Member ibeg,
                                 const Member iend) {
-#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-#endif
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
     for (Member iwork = ibeg; iwork < iend; ++iwork) {
       iterate_type(mdr_policy, functor)(iwork);
     }
   }
 
+  template <class Policy>
+  typename std::enable_if_t<std::is_same<typename Policy::schedule_type::type,
+                                         Kokkos::Dynamic>::value>
+  execute_parallel() const {
+#pragma omp parallel for schedule(dynamic KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
+    num_threads(OpenMP::impl_thread_pool_size())
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
+      iterate_type(m_mdr_policy, m_functor)(iwork);
+    }
+  }
+
+  template <class Policy>
+  typename std::enable_if<!std::is_same<typename Policy::schedule_type::type,
+                                        Kokkos::Dynamic>::value>::type
+  execute_parallel() const {
+#pragma omp parallel for schedule(static KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE) \
+    num_threads(OpenMP::impl_thread_pool_size())
+    KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+    for (auto iwork = m_policy.begin(); iwork < m_policy.end(); ++iwork) {
+      iterate_type(m_mdr_policy, m_functor)(iwork);
+    }
+  }
+
  public:
   inline void execute() const {
-    enum {
-      is_dynamic = std::is_same<typename Policy::schedule_type::type,
-                                Kokkos::Dynamic>::value
-    };
-
     if (OpenMP::in_parallel()) {
       ParallelFor::exec_range(m_mdr_policy, m_functor, m_policy.begin(),
                               m_policy.end());
-    } else {
-      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
+      return;
+    }
+
+#ifndef KOKKOS_INTERNAL_DISABLE_NATIVE_OPENMP
+    execute_parallel<Policy>();
+#else
+    constexpr bool is_dynamic =
+        std::is_same<typename Policy::schedule_type::type,
+                     Kokkos::Dynamic>::value;
 
 #pragma omp parallel num_threads(OpenMP::impl_thread_pool_size())
-      {
-        HostThreadTeamData& data = *(m_instance->get_thread_data());
+    {
+      HostThreadTeamData& data = *(m_instance->get_thread_data());
 
-        data.set_work_partition(m_policy.end() - m_policy.begin(),
-                                m_policy.chunk_size());
+      data.set_work_partition(m_policy.end() - m_policy.begin(),
+                              m_policy.chunk_size());
 
-        if (is_dynamic) {
-          // Make sure work partition is set before stealing
-          if (data.pool_rendezvous()) data.pool_rendezvous_release();
-        }
+      if (is_dynamic) {
+        // Make sure work partition is set before stealing
+        if (data.pool_rendezvous()) data.pool_rendezvous_release();
+      }
 
-        std::pair<int64_t, int64_t> range(0, 0);
+      std::pair<int64_t, int64_t> range(0, 0);
 
-        do {
-          range = is_dynamic ? data.get_work_stealing_chunk()
-                             : data.get_work_partition();
+      do {
+        range = is_dynamic ? data.get_work_stealing_chunk()
+                           : data.get_work_partition();
 
-          ParallelFor::exec_range(m_mdr_policy, m_functor,
-                                  range.first + m_policy.begin(),
-                                  range.second + m_policy.begin());
+        ParallelFor::exec_range(m_mdr_policy, m_functor,
+                                range.first + m_policy.begin(),
+                                range.second + m_policy.begin());
 
-        } while (is_dynamic && 0 <= range.first);
-      }
-      // END #pragma omp parallel
+      } while (is_dynamic && 0 <= range.first);
     }
+    // END #pragma omp parallel
+#endif
   }
 
   inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy)
-      : m_instance(t_openmp_instance),
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
+        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+  }
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy&, const Functor&) {
     /**
@@ -258,9 +319,6 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using WorkRange = typename Policy::WorkRange;
   using Member    = typename Policy::member_type;
 
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
-
   using ReducerConditional =
       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                          FunctorType, ReducerType>;
@@ -270,34 +328,31 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                          void>;
 
   // Static Assert WorkTag void if ReducerType not InvalidType
-
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
 
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend, reference_type update) {
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update) {
     for (Member iwork = ibeg; iwork < iend; ++iwork) {
       functor(iwork, update);
     }
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend, reference_type update) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update) {
     const TagType t{};
     for (Member iwork = ibeg; iwork < iend; ++iwork) {
       functor(t, iwork, update);
@@ -306,12 +361,13 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
  public:
   inline void execute() const {
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
     if (m_policy.end() <= m_policy.begin()) {
       if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), m_result_ptr);
+        final_reducer.init(m_result_ptr);
+        final_reducer.final(m_result_ptr);
       }
       return;
     }
@@ -320,8 +376,6 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                 Kokkos::Dynamic>::value
     };
 
-    OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
-
     const size_t pool_reduce_bytes =
         Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
 
@@ -345,9 +399,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         if (data.pool_rendezvous()) data.pool_rendezvous_release();
       }
 
-      reference_type update =
-          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          data.pool_reduce_local());
+      reference_type update = final_reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
 
       std::pair<int64_t, int64_t> range(0, 0);
 
@@ -368,12 +421,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
 
     for (int i = 1; i < pool_size; ++i) {
-      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
-                      m_instance->get_thread_data(i)->pool_reduce_local());
+      final_reducer.join(
+          ptr, reinterpret_cast<pointer_type>(
+                   m_instance->get_thread_data(i)->pool_reduce_local()));
     }
 
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), ptr);
+    final_reducer.final(ptr);
 
     if (m_result_ptr) {
       const int n = Analysis::value_count(
@@ -391,14 +444,19 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   inline ParallelReduce(
       const FunctorType& arg_functor, Policy arg_policy,
       const ViewType& arg_view,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
-      : m_instance(t_openmp_instance),
+      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                           !Kokkos::is_reducer<ReducerType>::value,
+                       void*> = nullptr)
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_view.data()) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
     /*static_assert( std::is_same< typename ViewType::memory_space
                                     , Kokkos::HostSpace >::value
       , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
@@ -407,11 +465,16 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
-      : m_instance(t_openmp_instance),
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
     /*static_assert( std::is_same< typename ViewType::memory_space
                                     , Kokkos::HostSpace >::value
       , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
@@ -431,9 +494,6 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using WorkRange = typename Policy::WorkRange;
   using Member    = typename Policy::member_type;
 
-  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
-                                   MDRangePolicy, FunctorType>;
-
   using ReducerConditional =
       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                          FunctorType, ReducerType>;
@@ -442,8 +502,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
 
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
+                                   MDRangePolicy, ReducerTypeFwd>;
 
   using pointer_type   = typename Analysis::pointer_type;
   using value_type     = typename Analysis::value_type;
@@ -453,7 +513,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
                                              WorkTag, reference_type>;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const MDRangePolicy m_mdr_policy;
   const Policy m_policy;  // construct as RangePolicy( 0, num_tiles
@@ -476,8 +536,6 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                                 Kokkos::Dynamic>::value
     };
 
-    OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
-
     const size_t pool_reduce_bytes =
         Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
 
@@ -488,6 +546,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                                    0  // thread_local_bytes
     );
 
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
     const int pool_size = OpenMP::impl_thread_pool_size();
 #pragma omp parallel num_threads(pool_size)
     {
@@ -501,9 +562,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         if (data.pool_rendezvous()) data.pool_rendezvous_release();
       }
 
-      reference_type update =
-          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          data.pool_reduce_local());
+      reference_type update = final_reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
 
       std::pair<int64_t, int64_t> range(0, 0);
 
@@ -525,12 +585,12 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
 
     for (int i = 1; i < pool_size; ++i) {
-      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
-                      m_instance->get_thread_data(i)->pool_reduce_local());
+      final_reducer.join(
+          ptr, reinterpret_cast<pointer_type>(
+                   m_instance->get_thread_data(i)->pool_reduce_local()));
     }
 
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), ptr);
+    final_reducer.final(ptr);
 
     if (m_result_ptr) {
       const int n = Analysis::value_count(
@@ -548,15 +608,20 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   inline ParallelReduce(
       const FunctorType& arg_functor, MDRangePolicy arg_policy,
       const ViewType& arg_view,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
-      : m_instance(t_openmp_instance),
+      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                           !Kokkos::is_reducer<ReducerType>::value,
+                       void*> = nullptr)
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_mdr_policy(arg_policy),
         m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
         m_reducer(InvalidType()),
         m_result_ptr(arg_view.data()) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
     /*static_assert( std::is_same< typename ViewType::memory_space
                                     , Kokkos::HostSpace >::value
       , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
@@ -565,12 +630,17 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
   inline ParallelReduce(const FunctorType& arg_functor,
                         MDRangePolicy arg_policy, const ReducerType& reducer)
-      : m_instance(t_openmp_instance),
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_mdr_policy(arg_policy),
         m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
     /*static_assert( std::is_same< typename ViewType::memory_space
                                     , Kokkos::HostSpace >::value
       , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
@@ -609,32 +679,26 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   using WorkRange = typename Policy::WorkRange;
   using Member    = typename Policy::member_type;
 
-  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
-  using ValueOps  = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
-
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const Policy m_policy;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend, reference_type update, const bool final) {
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
     for (Member iwork = ibeg; iwork < iend; ++iwork) {
       functor(iwork, update, final);
     }
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend, reference_type update, const bool final) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
     const TagType t{};
     for (Member iwork = ibeg; iwork < iend; ++iwork) {
       functor(t, iwork, update, final);
@@ -643,8 +707,6 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 
  public:
   inline void execute() const {
-    OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan");
-
     const int value_count          = Analysis::value_count(m_functor);
     const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor);
 
@@ -658,12 +720,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp parallel num_threads(OpenMP::impl_thread_pool_size())
     {
       HostThreadTeamData& data = *(m_instance->get_thread_data());
+      typename Analysis::Reducer final_reducer(&m_functor);
 
       const WorkRange range(m_policy, omp_get_thread_num(),
                             omp_get_num_threads());
 
-      reference_type update_sum =
-          ValueInit::init(m_functor, data.pool_reduce_local());
+      reference_type update_sum = final_reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
 
       ParallelScan::template exec_range<WorkTag>(
           m_functor, range.begin(), range.end(), update_sum, false);
@@ -681,9 +744,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
             for (int j = 0; j < value_count; ++j) {
               ptr[j + value_count] = ptr_prev[j + value_count];
             }
-            ValueJoin::join(m_functor, ptr + value_count, ptr_prev);
+            final_reducer.join(ptr + value_count, ptr_prev);
           } else {
-            ValueInit::init(m_functor, ptr + value_count);
+            final_reducer.init(ptr + value_count);
           }
 
           ptr_prev = ptr;
@@ -692,8 +755,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         data.pool_rendezvous_release();
       }
 
-      reference_type update_base = ValueOps::reference(
-          ((pointer_type)data.pool_reduce_local()) + value_count);
+      reference_type update_base = final_reducer.reference(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()) +
+          value_count);
 
       ParallelScan::template exec_range<WorkTag>(
           m_functor, range.begin(), range.end(), update_base, true);
@@ -703,9 +767,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   //----------------------------------------
 
   inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_instance(t_openmp_instance),
-        m_functor(arg_functor),
-        m_policy(arg_policy) {}
+      : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+  }
 
   //----------------------------------------
 };
@@ -723,33 +791,27 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   using WorkRange = typename Policy::WorkRange;
   using Member    = typename Policy::member_type;
 
-  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
-  using ValueOps  = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
-
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const Policy m_policy;
   ReturnType& m_returnvalue;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend, reference_type update, const bool final) {
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
     for (Member iwork = ibeg; iwork < iend; ++iwork) {
       functor(iwork, update, final);
     }
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType& functor, const Member ibeg,
-                 const Member iend, reference_type update, const bool final) {
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType& functor, const Member ibeg, const Member iend,
+      reference_type update, const bool final) {
     const TagType t{};
     for (Member iwork = ibeg; iwork < iend; ++iwork) {
       functor(t, iwork, update, final);
@@ -758,8 +820,6 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
  public:
   inline void execute() const {
-    OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan");
-
     const int value_count          = Analysis::value_count(m_functor);
     const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor);
 
@@ -773,11 +833,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp parallel num_threads(OpenMP::impl_thread_pool_size())
     {
       HostThreadTeamData& data = *(m_instance->get_thread_data());
+      typename Analysis::Reducer final_reducer(&m_functor);
 
       const WorkRange range(m_policy, omp_get_thread_num(),
                             omp_get_num_threads());
-      reference_type update_sum =
-          ValueInit::init(m_functor, data.pool_reduce_local());
+      reference_type update_sum = final_reducer.init(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()));
 
       ParallelScanWithTotal::template exec_range<WorkTag>(
           m_functor, range.begin(), range.end(), update_sum, false);
@@ -795,9 +856,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
             for (int j = 0; j < value_count; ++j) {
               ptr[j + value_count] = ptr_prev[j + value_count];
             }
-            ValueJoin::join(m_functor, ptr + value_count, ptr_prev);
+            final_reducer.join(ptr + value_count, ptr_prev);
           } else {
-            ValueInit::init(m_functor, ptr + value_count);
+            final_reducer.init(ptr + value_count);
           }
 
           ptr_prev = ptr;
@@ -806,8 +867,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
         data.pool_rendezvous_release();
       }
 
-      reference_type update_base = ValueOps::reference(
-          ((pointer_type)data.pool_reduce_local()) + value_count);
+      reference_type update_base = final_reducer.reference(
+          reinterpret_cast<pointer_type>(data.pool_reduce_local()) +
+          value_count);
 
       ParallelScanWithTotal::template exec_range<WorkTag>(
           m_functor, range.begin(), range.end(), update_base, true);
@@ -823,10 +885,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   inline ParallelScanWithTotal(const FunctorType& arg_functor,
                                const Policy& arg_policy,
                                ReturnType& arg_returnvalue)
-      : m_instance(t_openmp_instance),
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_policy(arg_policy),
-        m_returnvalue(arg_returnvalue) {}
+        m_returnvalue(arg_returnvalue) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+  }
 
   //----------------------------------------
 };
@@ -852,17 +920,16 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using SchedTag = typename Policy::schedule_type::type;
   using Member   = typename Policy::member_type;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const Policy m_policy;
-  const int m_shmem_size;
+  const size_t m_shmem_size;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<(std::is_same<TagType, void>::value)>::type
-      exec_team(const FunctorType& functor, HostThreadTeamData& data,
-                const int league_rank_begin, const int league_rank_end,
-                const int league_size) {
+  inline static std::enable_if_t<(std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      const int league_rank_begin, const int league_rank_end,
+      const int league_size) {
     for (int r = league_rank_begin; r < league_rank_end;) {
       functor(Member(data, r, league_size));
 
@@ -877,11 +944,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<(!std::is_same<TagType, void>::value)>::type
-      exec_team(const FunctorType& functor, HostThreadTeamData& data,
-                const int league_rank_begin, const int league_rank_end,
-                const int league_size) {
+  inline static std::enable_if_t<(!std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      const int league_rank_begin, const int league_rank_end,
+      const int league_size) {
     const TagType t{};
 
     for (int r = league_rank_begin; r < league_rank_end;) {
@@ -901,11 +967,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   inline void execute() const {
     enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value };
 
-    OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
-
     const size_t pool_reduce_size  = 0;  // Never shrinks
     const size_t team_reduce_size  = TEAM_REDUCE_SIZE * m_policy.team_size();
-    const size_t team_shared_size  = m_shmem_size + m_policy.scratch_size(1);
+    const size_t team_shared_size  = m_shmem_size;
     const size_t thread_local_size = 0;  // Never shrinks
 
     m_instance->resize_thread_data(pool_reduce_size, team_reduce_size,
@@ -949,12 +1013,18 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_instance(t_openmp_instance),
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_policy(arg_policy),
         m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                      FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {}
+                         arg_functor, arg_policy.team_size())) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -968,9 +1038,6 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using Policy =
       Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP, Properties...>;
 
-  using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
-
   using WorkTag  = typename Policy::work_tag;
   using SchedTag = typename Policy::schedule_type::type;
   using Member   = typename Policy::member_type;
@@ -984,13 +1051,13 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
 
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
 
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  OpenMPExec* m_instance;
+  OpenMPInternal* m_instance;
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
@@ -998,11 +1065,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const int m_shmem_size;
 
   template <class TagType>
-  inline static
-      typename std::enable_if<(std::is_same<TagType, void>::value)>::type
-      exec_team(const FunctorType& functor, HostThreadTeamData& data,
-                reference_type& update, const int league_rank_begin,
-                const int league_rank_end, const int league_size) {
+  inline static std::enable_if_t<(std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      reference_type& update, const int league_rank_begin,
+      const int league_rank_end, const int league_size) {
     for (int r = league_rank_begin; r < league_rank_end;) {
       functor(Member(data, r, league_size), update);
 
@@ -1017,11 +1083,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   template <class TagType>
-  inline static
-      typename std::enable_if<(!std::is_same<TagType, void>::value)>::type
-      exec_team(const FunctorType& functor, HostThreadTeamData& data,
-                reference_type& update, const int league_rank_begin,
-                const int league_rank_end, const int league_size) {
+  inline static std::enable_if_t<(!std::is_void<TagType>::value)> exec_team(
+      const FunctorType& functor, HostThreadTeamData& data,
+      reference_type& update, const int league_rank_begin,
+      const int league_rank_end, const int league_size) {
     const TagType t{};
 
     for (int r = league_rank_begin; r < league_rank_end;) {
@@ -1041,16 +1106,16 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   inline void execute() const {
     enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value };
 
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
     if (m_policy.league_size() == 0 || m_policy.team_size() == 0) {
       if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-            ReducerConditional::select(m_functor, m_reducer), m_result_ptr);
+        final_reducer.init(m_result_ptr);
+        final_reducer.final(m_result_ptr);
       }
       return;
     }
-    OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
 
     const size_t pool_reduce_size =
         Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
@@ -1083,9 +1148,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       }
 
       if (active) {
-        reference_type update =
-            ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                            data.pool_reduce_local());
+        reference_type update = final_reducer.init(
+            reinterpret_cast<pointer_type>(data.pool_reduce_local()));
 
         std::pair<int64_t, int64_t> range(0, 0);
 
@@ -1099,8 +1163,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
         } while (is_dynamic && 0 <= range.first);
       } else {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        data.pool_reduce_local());
+        final_reducer.init(
+            reinterpret_cast<pointer_type>(data.pool_reduce_local()));
       }
 
       data.disband_team();
@@ -1122,12 +1186,12 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());
 
     for (int i = 1; i < pool_size; ++i) {
-      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
-                      m_instance->get_thread_data(i)->pool_reduce_local());
+      final_reducer.join(
+          ptr, reinterpret_cast<pointer_type>(
+                   m_instance->get_thread_data(i)->pool_reduce_local()));
     }
 
-    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
-        ReducerConditional::select(m_functor, m_reducer), ptr);
+    final_reducer.final(ptr);
 
     if (m_result_ptr) {
       const int n = Analysis::value_count(
@@ -1145,21 +1209,27 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   inline ParallelReduce(
       const FunctorType& arg_functor, const Policy& arg_policy,
       const ViewType& arg_result,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
-      : m_instance(t_openmp_instance),
+      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                           !Kokkos::is_reducer<ReducerType>::value,
+                       void*> = nullptr)
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
         m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                      FunctorTeamShmemSize<FunctorType>::value(
-                         arg_functor, arg_policy.team_size())) {}
+                         arg_functor, arg_policy.team_size())) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
+  }
 
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
-      : m_instance(t_openmp_instance),
+      : m_instance(nullptr),
         m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
@@ -1167,6 +1237,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                      FunctorTeamShmemSize<FunctorType>::value(
                          arg_functor, arg_policy.team_size())) {
+    if (t_openmp_instance) {
+      m_instance = t_openmp_instance;
+    } else {
+      m_instance = arg_policy.space().impl_internal_space_instance();
+    }
     /*static_assert( std::is_same< typename ViewType::memory_space
                             , Kokkos::HostSpace >::value
     , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
@@ -1180,5 +1255,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#undef KOKKOS_PRAGMA_IVDEP_IF_ENABLED
+#undef KOKKOS_OPENMP_OPTIONAL_CHUNK_SIZE
+
 #endif
 #endif /* KOKKOS_OPENMP_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
index f7338819a..4babcf03d 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_OPENMP) && defined(KOKKOS_ENABLE_TASKDAG)
 
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
index d9234e341..ec1ede0e2 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -75,7 +75,7 @@ class HostThreadTeamDataSingleton : private HostThreadTeamData {
 // TODO @tasking @cleanup DSH Make this the general class template and make the
 // old code the partial specialization
 template <class QueueType>
-class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType> > {
+class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType>> {
  public:
   using execution_space = Kokkos::OpenMP;
   using scheduler_type  = SimpleTaskScheduler<Kokkos::OpenMP, QueueType>;
@@ -96,10 +96,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType> > {
     // HostThreadTeamData& team_data_single =
     // HostThreadTeamDataSingleton::singleton();
 
-    // TODO @tasking @generalization DSH use
-    // scheduler.get_execution_space().impl() (or something like that) instead
-    // of the thread-local variable
-    Impl::OpenMPExec* instance = t_openmp_instance;
+    Impl::OpenMPInternal* instance =
+        execution_space().impl_internal_space_instance();
     const int pool_size = get_max_team_count(scheduler.get_execution_space());
 
     // TODO @tasking @new_feature DSH allow team sizes other than 1
@@ -198,8 +196,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType> > {
 template <class Scheduler>
 class TaskQueueSpecializationConstrained<
     Scheduler,
-    typename std::enable_if<std::is_same<typename Scheduler::execution_space,
-                                         Kokkos::OpenMP>::value>::type> {
+    std::enable_if_t<std::is_same<typename Scheduler::execution_space,
+                                  Kokkos::OpenMP>::value>> {
  public:
   using execution_space = Kokkos::OpenMP;
   using scheduler_type  = Scheduler;
@@ -258,8 +256,9 @@ class TaskQueueSpecializationConstrained<
     HostThreadTeamData& team_data_single =
         HostThreadTeamDataSingleton::singleton();
 
-    Impl::OpenMPExec* instance = t_openmp_instance;
-    const int pool_size        = OpenMP::impl_thread_pool_size();
+    Impl::OpenMPInternal* instance =
+        execution_space().impl_internal_space_instance();
+    const int pool_size = OpenMP::impl_thread_pool_size();
 
     const int team_size = 1;       // Threads per core
     instance->resize_thread_data(0 /* global reduce buffer */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
index be7afd328..73533178b 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
@@ -48,7 +48,7 @@
 #include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_OPENMP)
 
-#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Instance.hpp>
 
 namespace Kokkos {
 namespace Impl {
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
index 92e4ee636..55d9c5860 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
@@ -60,13 +60,13 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
   FunctorType m_functor;
 
   template <class TagType>
-  typename std::enable_if<std::is_same<TagType, void>::value>::type exec_one(
+  std::enable_if_t<std::is_void<TagType>::value> exec_one(
       const std::int32_t w) const noexcept {
     m_functor(w);
   }
 
   template <class TagType>
-  typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_one(
+  std::enable_if_t<!std::is_void<TagType>::value> exec_one(
       const std::int32_t w) const noexcept {
     const TagType t{};
     m_functor(t, w);
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
index c95951a11..5ff9bf32b 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <algorithm>
@@ -61,6 +65,7 @@
 #include <Kokkos_OpenMPTargetSpace.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_MemorySpace.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -70,23 +75,66 @@ namespace Experimental {
 /* Default allocation mechanism */
 OpenMPTargetSpace::OpenMPTargetSpace() {}
 
-void *OpenMPTargetSpace::allocate(const size_t arg_alloc_size) const {
-  static_assert(sizeof(void *) == sizeof(uintptr_t),
+void* OpenMPTargetSpace::impl_allocate(
+
+    const char* arg_label, const size_t arg_alloc_size,
+    const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  static_assert(sizeof(void*) == sizeof(uintptr_t),
                 "Error sizeof(void*) != sizeof(uintptr_t)");
 
-  void *ptr;
+  void* ptr;
 
   ptr = omp_target_alloc(arg_alloc_size, omp_get_default_device());
 
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
+  }
+
   return ptr;
 }
 
-void OpenMPTargetSpace::deallocate(void *const arg_alloc_ptr,
-                                   const size_t /*arg_alloc_size*/) const {
+void* OpenMPTargetSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+
+void* OpenMPTargetSpace::allocate(const char* arg_label,
+                                  const size_t arg_alloc_size,
+                                  const size_t arg_logical_size) const {
+  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
+}
+
+void OpenMPTargetSpace::impl_deallocate(
+    const char* arg_label, void* const arg_alloc_ptr,
+    const size_t arg_alloc_size, const size_t arg_logical_size,
+    const Kokkos::Tools::SpaceHandle arg_handle) const {
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
+                                      reported_size);
+  }
   if (arg_alloc_ptr) {
     omp_target_free(arg_alloc_ptr, omp_get_default_device());
   }
 }
+
+void OpenMPTargetSpace::deallocate(void* const arg_alloc_ptr,
+                                   const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void OpenMPTargetSpace::deallocate(const char* arg_label,
+                                   void* const arg_alloc_ptr,
+                                   const size_t arg_alloc_size,
+                                   const size_t arg_logical_size) const
+
+{
+  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
@@ -103,14 +151,16 @@ SharedAllocationRecord<void, void> SharedAllocationRecord<
 
 SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
                        void>::~SharedAllocationRecord() {
-  m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
-                     SharedAllocationRecord<void, void>::m_alloc_size);
+  auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+  m_space.deallocate(m_label.c_str(),
+                     SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     alloc_size, (alloc_size - sizeof(SharedAllocationHeader)));
 }
 
 SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
     SharedAllocationRecord(
-        const Kokkos::Experimental::OpenMPTargetSpace &arg_space,
-        const std::string &arg_label, const size_t arg_alloc_size,
+        const Kokkos::Experimental::OpenMPTargetSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
         const SharedAllocationRecord<void, void>::function_type arg_dealloc)
     // Pass through allocated [ SharedAllocationHeader , user_memory ]
     // Pass through deallocation function
@@ -119,8 +169,8 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
           &SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace,
                                   void>::s_root_record,
 #endif
-          reinterpret_cast<SharedAllocationHeader *>(arg_space.allocate(
-              sizeof(SharedAllocationHeader) + arg_alloc_size)),
+          Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label,
+                                                       arg_alloc_size),
           sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
           arg_label),
       m_space(arg_space) {
@@ -143,42 +193,6 @@ SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>::
 }  // namespace Impl
 }  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-template <class>
-struct ViewOperatorBoundsErrorAbort;
-
-template <>
-struct ViewOperatorBoundsErrorAbort<Kokkos::Experimental::OpenMPTargetSpace> {
-  static void apply(const size_t rank, const size_t n0, const size_t n1,
-                    const size_t n2, const size_t n3, const size_t n4,
-                    const size_t n5, const size_t n6, const size_t n7,
-                    const size_t i0, const size_t i1, const size_t i2,
-                    const size_t i3, const size_t i4, const size_t i5,
-                    const size_t i6, const size_t i7);
-};
-
-void ViewOperatorBoundsErrorAbort<Kokkos::Experimental::OpenMPTargetSpace>::
-    apply(const size_t rank, const size_t n0, const size_t n1, const size_t n2,
-          const size_t n3, const size_t n4, const size_t n5, const size_t n6,
-          const size_t n7, const size_t i0, const size_t i1, const size_t i2,
-          const size_t i3, const size_t i4, const size_t i5, const size_t i6,
-          const size_t i7) {
-  printf(
-      "View operator bounds error : rank(%lu) "
-      "dim(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu) "
-      "index(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu)",
-      rank, n0, n1, n2, n3, n4, n5, n6, n7, i0, i1, i2, i3, i4, i5, i6, i7);
-  // Kokkos::Impl::throw_runtime_exception( buffer );
-}
-
-}  // namespace Impl
-}  // namespace Kokkos
-
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 /*
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
index 7ff885ed8..d3bec5aeb 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <stdio.h>
 #include <limits>
 #include <iostream>
@@ -118,7 +122,8 @@ void OpenMPTargetExec::clear_lock_array() {
 void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; }
 
 void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
-                                      int64_t shmem_size_L1) {
+                                      int64_t shmem_size_L1,
+                                      int64_t league_size) {
   Kokkos::Experimental::OpenMPTargetSpace space;
   const int64_t shmem_size =
       shmem_size_L0 + shmem_size_L1;  // L0 + L1 scratch memory per team.
@@ -127,7 +132,7 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
   // on the maximum number of in-flight teams possible.
   int64_t total_size =
       (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) *
-      (MAX_ACTIVE_THREADS / team_size);
+      std::min(MAX_ACTIVE_THREADS / team_size, league_size);
 
   if (total_size > m_scratch_size) {
     space.deallocate(m_scratch_ptr, m_scratch_size);
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
index 9d0507847..52f5dcb83 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
@@ -59,6 +59,13 @@
 #define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 #endif
 
+// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
+// hierarchical scan. It avoids hitting the code path which we wanted to
+// write but doesn't work. undef'ed at the end.
+#ifndef KOKKOS_ARCH_INTEL_GPU
+#define KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
+#endif
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -69,33 +76,23 @@ template <class Reducer>
 struct OpenMPTargetReducerWrapper {
   using value_type = typename Reducer::value_type;
 
+  // Using a generic unknown Reducer for the OpenMPTarget backend is not
+  // implemented.
   KOKKOS_INLINE_FUNCTION
-  static void join(value_type&, const value_type&) {
-    printf(
-        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
-        "implemented.");
-  }
+  static void join(value_type&, const value_type&) = delete;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type&, const volatile value_type&) {
-    printf(
-        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
-        "implemented.");
-  }
+  static void join(volatile value_type&, const volatile value_type&) = delete;
 
   KOKKOS_INLINE_FUNCTION
-  static void init(value_type&) {
-    printf(
-        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
-        "implemented.");
-  }
+  static void init(value_type&) = delete;
 };
 
 template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   // Required
   KOKKOS_INLINE_FUNCTION
@@ -116,7 +113,7 @@ template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   // Required
   KOKKOS_INLINE_FUNCTION
@@ -137,7 +134,7 @@ template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   // Required
   KOKKOS_INLINE_FUNCTION
@@ -160,7 +157,7 @@ template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   // Required
   KOKKOS_INLINE_FUNCTION
@@ -184,7 +181,7 @@ template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -206,7 +203,7 @@ template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
@@ -231,7 +228,7 @@ template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   // Required
   KOKKOS_INLINE_FUNCTION
@@ -254,7 +251,7 @@ template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
  public:
   // Required
-  using value_type = typename std::remove_cv<Scalar>::type;
+  using value_type = std::remove_cv_t<Scalar>;
 
   // Required
   KOKKOS_INLINE_FUNCTION
@@ -276,8 +273,8 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
 template <class Scalar, class Index, class Space>
 struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -304,8 +301,8 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
 template <class Scalar, class Index, class Space>
 struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -331,7 +328,7 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
 template <class Scalar, class Space>
 struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
 
  public:
   // Required
@@ -368,8 +365,8 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
 template <class Scalar, class Index, class Space>
 struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -415,8 +412,8 @@ struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
 template <class Scalar, class Index, class Space>
 struct OpenMPTargetReducerWrapper<MaxFirstLoc<Scalar, Index, Space>> {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -458,8 +455,8 @@ struct OpenMPTargetReducerWrapper<MaxFirstLoc<Scalar, Index, Space>> {
 template <class Scalar, class Index, class Space>
 struct OpenMPTargetReducerWrapper<MinFirstLoc<Scalar, Index, Space>> {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -501,8 +498,8 @@ struct OpenMPTargetReducerWrapper<MinFirstLoc<Scalar, Index, Space>> {
 template <class Scalar, class Index, class Space>
 struct OpenMPTargetReducerWrapper<MinMaxFirstLastLoc<Scalar, Index, Space>> {
  private:
-  using scalar_type = typename std::remove_cv<Scalar>::type;
-  using index_type  = typename std::remove_cv<Index>::type;
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -563,7 +560,7 @@ struct OpenMPTargetReducerWrapper<MinMaxFirstLastLoc<Scalar, Index, Space>> {
 template <class Index, class Space>
 struct OpenMPTargetReducerWrapper<FirstLoc<Index, Space>> {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -601,7 +598,7 @@ struct OpenMPTargetReducerWrapper<FirstLoc<Index, Space>> {
 template <class Index, class Space>
 struct OpenMPTargetReducerWrapper<LastLoc<Index, Space>> {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -639,7 +636,7 @@ struct OpenMPTargetReducerWrapper<LastLoc<Index, Space>> {
 template <class Index, class Space>
 struct OpenMPTargetReducerWrapper<StdIsPartitioned<Index, Space>> {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -686,7 +683,7 @@ struct OpenMPTargetReducerWrapper<StdIsPartitioned<Index, Space>> {
 template <class Index, class Space>
 struct OpenMPTargetReducerWrapper<StdPartitionPoint<Index, Space>> {
  private:
-  using index_type = typename std::remove_cv<Index>::type;
+  using index_type = std::remove_cv_t<Index>;
 
  public:
   // Required
@@ -768,7 +765,7 @@ class OpenMPTargetExec {
   static void clear_lock_array();
   static void resize_scratch(int64_t team_reduce_bytes,
                              int64_t team_shared_bytes,
-                             int64_t thread_local_bytes);
+                             int64_t thread_local_bytes, int64_t league_size);
 
   static void* m_scratch_ptr;
   static int64_t m_scratch_size;
@@ -797,7 +794,7 @@ class OpenMPTargetExecTeamMember {
   using scratch_memory_space = execution_space::scratch_memory_space;
 
   scratch_memory_space m_team_shared;
-  int m_team_scratch_size[2];
+  size_t m_team_scratch_size[2];
   int m_team_rank;
   int m_team_size;
   int m_league_rank;
@@ -845,9 +842,8 @@ class OpenMPTargetExecTeamMember {
   KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value,
                                              int thread_id) const {
     // Make sure there is enough scratch space:
-    using type =
-        typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
-                                  ValueType, void>::type;
+    using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
+                                    ValueType, void>;
     type* team_scratch =
         reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
                                 TEAM_REDUCE_SIZE * omp_get_team_num());
@@ -864,13 +860,15 @@ class OpenMPTargetExecTeamMember {
     team_broadcast(value, thread_id);
   }
 
+  // FIXME_OPENMPTARGET this function has the wrong interface and currently
+  // ignores the reducer passed.
   template <class ValueType, class JoinOp>
   KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
-                                               const JoinOp& op_in) const {
+                                               const JoinOp&) const {
 #pragma omp barrier
 
     using value_type = ValueType;
-    const JoinLambdaAdapter<value_type, JoinOp> op(op_in);
+    //    const JoinLambdaAdapter<value_type, JoinOp> op(op_in);
 
     // Make sure there is enough scratch space:
     using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE),
@@ -983,7 +981,7 @@ class OpenMPTargetExecTeamMember {
                                // Properties ...> & team
       ,
       void* const glb_scratch, const int shmem_block_index,
-      const int shmem_size_L0, const int shmem_size_L1)
+      const size_t shmem_size_L0, const size_t shmem_size_L1)
       : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
         m_team_rank(0),
         m_team_size(team_size),
@@ -1322,11 +1320,10 @@ TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type,
-    Impl::OpenMPTargetExecTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
 TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
                 const iType1& begin, const iType2& end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<
       iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin),
                                                iType(end));
@@ -1343,11 +1340,10 @@ ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type,
-    Impl::OpenMPTargetExecTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
 ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
                   const iType1& arg_begin, const iType2& arg_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::ThreadVectorRangeBoundariesStruct<
       iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
                                                iType(arg_end));
@@ -1364,11 +1360,10 @@ TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type,
-    Impl::OpenMPTargetExecTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
 TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
                 const iType1& arg_begin, const iType2& arg_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamVectorRangeBoundariesStruct<
       iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
                                                iType(arg_end));
@@ -1411,12 +1406,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  */
 
 template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<!Kokkos::is_reducer_type<ValueType>::value>
-    parallel_reduce(
-        const Impl::TeamThreadRangeBoundariesStruct<
-            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-        const Lambda& lambda, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
   // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
   // elements in the array <= 32. For reduction we allocate, 16 bytes per
   // element in the scratch space, hence, 16*32 = 512.
@@ -1456,12 +1449,10 @@ KOKKOS_INLINE_FUNCTION
 // and crashes. We should try this with every new compiler
 // This is the variant we actually wanted to write
 template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
-    parallel_reduce(
-        const Impl::TeamThreadRangeBoundariesStruct<
-            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-        const Lambda& lambda, ReducerType result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType result) {
   using ValueType = typename ReducerType::value_type;
 
 #pragma omp declare reduction(                                               \
@@ -1491,12 +1482,10 @@ KOKKOS_INLINE_FUNCTION
 }
 #else
 template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
-    parallel_reduce(
-        const Impl::TeamThreadRangeBoundariesStruct<
-            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-        const Lambda& lambda, ReducerType result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType result) {
   using ValueType = typename ReducerType::value_type;
 
   // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
@@ -1602,21 +1591,33 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::TeamThreadRangeBoundariesStruct<
         iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
     const FunctorType& lambda) {
-  // Extract value_type from lambda
-  using value_type = typename Kokkos::Impl::FunctorAnalysis<
-      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
-      FunctorType>::value_type;
+  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         TeamPolicy<Experimental::OpenMPTarget>,
+                                         FunctorType>;
+  using value_type = typename Analysis::value_type;
 
   const auto start = loop_bounds.start;
   const auto end   = loop_bounds.end;
-  // Note this thing is called .member in the CUDA specialization of
-  // TeamThreadRangeBoundariesStruct
+  //   Note this thing is called .member in the CUDA specialization of
+  //   TeamThreadRangeBoundariesStruct
   auto& member         = loop_bounds.team;
   const auto team_size = member.team_size();
   const auto team_rank = member.team_rank();
-  const auto nchunk    = (end - start + team_size - 1) / team_size;
-  value_type accum     = 0;
-  // each team has to process one or more chunks of the prefix scan
+
+#if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND)
+  value_type scan_val = value_type();
+
+  if (team_rank == 0) {
+    for (iType i = start; i < end; ++i) {
+      lambda(i, scan_val, true);
+    }
+  }
+#pragma omp barrier
+#else
+  const auto nchunk = (end - start + team_size - 1) / team_size;
+  value_type accum  = 0;
+  // each team has to process one or
+  //      more chunks of the prefix scan
   for (iType i = 0; i < nchunk; ++i) {
     auto ii = start + i * team_size + team_rank;
     // local accumulation for this chunk
@@ -1634,6 +1635,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     // broadcast last value to rest of the team
     member.team_broadcast(accum, team_size - 1);
   }
+#endif
 }
 
 }  // namespace Kokkos
@@ -1686,12 +1688,10 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
 }
 
 template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
-    parallel_reduce(
-        const Impl::ThreadVectorRangeBoundariesStruct<
-            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-        const Lambda& lambda, ReducerType const& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
   using ValueType = typename ReducerType::value_type;
 
 #pragma omp declare reduction(                                               \
@@ -1756,8 +1756,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::ThreadVectorRangeBoundariesStruct<
         iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
     const FunctorType& lambda) {
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type  = typename ValueTraits::value_type;
+  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         TeamPolicy<Experimental::OpenMPTarget>,
+                                         FunctorType>;
+  using value_type = typename Analysis::value_type;
 
   value_type scan_val = value_type();
 
@@ -1771,6 +1773,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
+#undef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
+#endif
+
 namespace Kokkos {
 /** \brief  Intra-team vector parallel_for. Executes lambda(iType i) for each
  * i=0..N-1.
@@ -1833,12 +1839,10 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
 
 #if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
 template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
-    parallel_reduce(
-        const Impl::TeamVectorRangeBoundariesStruct<
-            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-        const Lambda& lambda, ReducerType const& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
   using ValueType = typename ReducerType::value_type;
 
   // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
@@ -1869,12 +1873,10 @@ KOKKOS_INLINE_FUNCTION
 }
 #else
 template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
-    parallel_reduce(
-        const Impl::TeamVectorRangeBoundariesStruct<
-            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-        const Lambda& lambda, ReducerType const& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
   using ValueType = typename ReducerType::value_type;
 
   // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index e421edc5b..51921765b 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
@@ -53,6 +57,7 @@
 #include <Kokkos_OpenMPTarget.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
 
 #include <sstream>
 
@@ -89,10 +94,10 @@ void OpenMPTargetInternal::fence(const std::string& name,
 }
 int OpenMPTargetInternal::concurrency() { return 128000; }
 const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; }
-void OpenMPTargetInternal::print_configuration(std::ostream& /*stream*/,
-                                               const bool) {
+void OpenMPTargetInternal::print_configuration(std::ostream& os,
+                                               bool /*verbose*/) const {
   // FIXME_OPENMPTARGET
-  printf("Using OpenMPTarget\n");
+  os << "Using OpenMPTarget\n";
 }
 
 void OpenMPTargetInternal::impl_finalize() {
@@ -133,9 +138,13 @@ OpenMPTarget::OpenMPTarget()
 const char* OpenMPTarget::name() {
   return Impl::OpenMPTargetInternal::impl_singleton()->name();
 }
-void OpenMPTarget::print_configuration(std::ostream& stream,
-                                       const bool detail) {
-  m_space_instance->print_configuration(stream, detail);
+void OpenMPTarget::print_configuration(std::ostream& os, bool verbose) const {
+  os << "OpenMPTarget Execution Space:\n";
+  os << "  KOKKOS_ENABLE_OPENMPTARGET: yes\n";
+
+  os << "\nOpenMPTarget Runtime Configuration:\n";
+
+  m_space_instance->print_configuration(os, verbose);
 }
 
 uint32_t OpenMPTarget::impl_instance_id() const noexcept {
@@ -145,25 +154,22 @@ uint32_t OpenMPTarget::impl_instance_id() const noexcept {
 int OpenMPTarget::concurrency() {
   return Impl::OpenMPTargetInternal::impl_singleton()->concurrency();
 }
-void OpenMPTarget::fence() {
-  Impl::OpenMPTargetInternal::impl_singleton()->fence(
-      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence");
-}
+
 void OpenMPTarget::fence(const std::string& name) {
   Impl::OpenMPTargetInternal::impl_singleton()->fence(name);
 }
-void OpenMPTarget::impl_static_fence() {
-  Impl::OpenMPTargetInternal::impl_singleton()->fence(
-      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence",
-      Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
-}
+
 void OpenMPTarget::impl_static_fence(const std::string& name) {
   Impl::OpenMPTargetInternal::impl_singleton()->fence(
       name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
 }
 
-void OpenMPTarget::impl_initialize() { m_space_instance->impl_initialize(); }
-void OpenMPTarget::impl_finalize() { m_space_instance->impl_finalize(); }
+void OpenMPTarget::impl_initialize(InitializationSettings const&) {
+  Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
+}
+void OpenMPTarget::impl_finalize() {
+  Impl::OpenMPTargetInternal::impl_singleton()->impl_finalize();
+}
 int OpenMPTarget::impl_is_initialized() {
   return Impl::OpenMPTargetInternal::impl_singleton()->impl_is_initialized();
 }
@@ -171,52 +177,9 @@ int OpenMPTarget::impl_is_initialized() {
 
 namespace Impl {
 int g_openmptarget_space_factory_initialized =
-    Kokkos::Impl::initialize_space_factory<OpenMPTargetSpaceInitializer>(
+    Kokkos::Impl::initialize_space_factory<Experimental::OpenMPTarget>(
         "160_OpenMPTarget");
 
-void OpenMPTargetSpaceInitializer::initialize(const InitArguments& args) {
-  // Prevent "unused variable" warning for 'args' input struct.  If
-  // Serial::initialize() ever needs to take arguments from the input
-  // struct, you may remove this line of code.
-  (void)args;
-
-  if (std::is_same<Kokkos::Experimental::OpenMPTarget,
-                   Kokkos::DefaultExecutionSpace>::value) {
-    Kokkos::Experimental::OpenMPTarget().impl_initialize();
-    // std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized"
-    // << std::endl ;
-  } else {
-    // std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not
-    // initialized" << std::endl ;
-  }
-}
-
-void OpenMPTargetSpaceInitializer::finalize(const bool all_spaces) {
-  if (std::is_same<Kokkos::Experimental::OpenMPTarget,
-                   Kokkos::DefaultExecutionSpace>::value ||
-      all_spaces) {
-    if (Kokkos::Experimental::OpenMPTarget().impl_is_initialized())
-      Kokkos::Experimental::OpenMPTarget().impl_finalize();
-  }
-}
-
-void OpenMPTargetSpaceInitializer::fence() {
-  Kokkos::Experimental::OpenMPTarget::impl_static_fence();
-}
-void OpenMPTargetSpaceInitializer::fence(const std::string& name) {
-  Kokkos::Experimental::OpenMPTarget::impl_static_fence(name);
-}
-
-void OpenMPTargetSpaceInitializer::print_configuration(std::ostream& msg,
-                                                       const bool detail) {
-  msg << "OpenMPTarget Execution Space:" << std::endl;
-  msg << "  KOKKOS_ENABLE_OPENMPTARGET: ";
-  msg << "yes" << std::endl;
-
-  msg << "\nOpenMPTarget Runtime Configuration:" << std::endl;
-  Kokkos::Experimental::OpenMPTarget().print_configuration(msg, detail);
-}
-
 }  // namespace Impl
 }  // Namespace Kokkos
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
index b49577119..8e4baf8c0 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
@@ -68,7 +68,7 @@ class OpenMPTargetInternal {
   int concurrency();
 
   //! Print configuration information to the given output stream.
-  void print_configuration(std::ostream&, const bool detail = false);
+  void print_configuration(std::ostream& os, bool verbose) const;
 
   static const char* name();
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index ab38dea02..dfb9ea70a 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -49,7 +49,6 @@
 #include <sstream>
 #include <Kokkos_Parallel.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -84,7 +83,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 
 #pragma omp target teams distribute parallel for map(to : a_functor)
     for (auto i = begin; i < end; ++i) {
-      if constexpr (std::is_same<TagType, void>::value) {
+      if constexpr (std::is_void<TagType>::value) {
         a_functor(i);
       } else {
         a_functor(TagType(), i);
@@ -127,8 +126,10 @@ template <class FunctorType, class PolicyType, class ReducerType,
 struct ParallelReduceSpecialize {
   inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
                              PointerType /*result_ptr*/) {
-    constexpr int FunctorHasJoin = ReduceFunctorHasJoin<FunctorType>::value;
-    constexpr int UseReducerType = is_reducer_type<ReducerType>::value;
+    constexpr int FunctorHasJoin =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                              FunctorType>::has_join_member_function;
+    constexpr int UseReducerType = is_reducer<ReducerType>::value;
 
     std::stringstream error_message;
     error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' '
@@ -145,17 +146,11 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
   using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
   using TagType    = typename PolicyType::work_tag;
   using ReducerTypeFwd =
-      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
-                                FunctorType, ReducerType>::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
-                         void>;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
-  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
-  using ReferenceType = typename ValueTraits::reference_type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         PolicyType, ReducerTypeFwd>;
+  using ReferenceType = typename Analysis::reference_type;
 
   using ParReduceCommon = ParallelReduceCommon<PointerType>;
 
@@ -188,7 +183,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
                                                      : f) reduction(custom \
                                                                     : result)
     for (auto i = begin; i < end; ++i) {
-      if constexpr (std::is_same<TagType, void>::value) {
+      if constexpr (std::is_void<TagType>::value) {
         f(i, result);
       } else {
         f(TagType(), i, result);
@@ -226,7 +221,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
          map(to:f) reduction(+: result)
         for (auto i = begin; i < end; ++i)
 
-          if constexpr (std::is_same<TagType, void>::value) {
+          if constexpr (std::is_void<TagType>::value) {
             f(i, result);
           } else {
             f(TagType(), i, result);
@@ -238,7 +233,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
                                                                     : result)
         for (auto i = begin; i < end; ++i)
 
-          if constexpr (std::is_same<TagType, void>::value) {
+          if constexpr (std::is_void<TagType>::value) {
             f(i, result);
           } else {
             f(TagType(), i, result);
@@ -260,7 +255,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
       }
 #pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions])
       for (auto i = begin; i < end; ++i) {
-        if constexpr (std::is_same<TagType, void>::value) {
+        if constexpr (std::is_void<TagType>::value) {
           f(i, result);
         } else {
           f(TagType(), i, result);
@@ -277,7 +272,10 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     const auto begin = p.begin();
     const auto end   = p.end();
 
-    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+    using FunctorAnalysis =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                              FunctorType>;
+    constexpr int HasInit = FunctorAnalysis::has_init_member_function;
 
     // Initialize the result pointer.
 
@@ -290,31 +288,30 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     const int max_teams =
         OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads;
     // Number of elements in the reduction
-    const auto value_count =
-        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+    const auto value_count = FunctorAnalysis::value_count(f);
 
     // Allocate scratch per active thread. Achieved by setting the first
     // parameter of `resize_scratch=1`.
-    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
+                                     std::numeric_limits<int64_t>::max());
     ValueType* scratch_ptr =
         static_cast<ValueType*>(OpenMPTargetExec::get_scratch_ptr());
 
 #pragma omp target map(to : f) is_device_ptr(scratch_ptr)
     {
+      typename FunctorAnalysis::Reducer final_reducer(&f);
       // Enter this loop if the functor has an `init`
       if constexpr (HasInit) {
         // The `init` routine needs to be called on the device since it might
         // need device members.
-        ValueInit::init(f, scratch_ptr);
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        final_reducer.init(scratch_ptr);
+        final_reducer.final(scratch_ptr);
       } else {
         for (int i = 0; i < value_count; ++i) {
           static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
         }
 
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        final_reducer.final(scratch_ptr);
       }
     }
 
@@ -337,6 +334,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     map(to                                                                   \
         : f) is_device_ptr(scratch_ptr)
     {
+      typename FunctorAnalysis::Reducer final_reducer(&f);
 #pragma omp parallel
       {
         const int team_num    = omp_get_team_num();
@@ -347,13 +345,13 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
             (team_num == num_teams - 1) ? end : (team_begin + chunk_size);
         ValueType* team_scratch =
             scratch_ptr + team_num * max_team_threads * value_count;
-        ReferenceType result = ValueInit::init(
-            f, &team_scratch[omp_get_thread_num() * value_count]);
+        ReferenceType result = final_reducer.init(
+            &team_scratch[omp_get_thread_num() * value_count]);
 
         // Accumulate partial results in thread specific storage.
 #pragma omp for simd
         for (auto i = team_begin; i < team_end; ++i) {
-          if constexpr (std::is_same<TagType, void>::value) {
+          if constexpr (std::is_void<TagType>::value) {
             f(i, result);
           } else {
             f(TagType(), i, result);
@@ -368,8 +366,8 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
           for (int i = 0; i < team_size - tree_neighbor_offset;
                i += 2 * tree_neighbor_offset) {
             const int neighbor = i + tree_neighbor_offset;
-            ValueJoin::join(f, &team_scratch[i * value_count],
-                            &team_scratch[neighbor * value_count]);
+            final_reducer.join(&team_scratch[i * value_count],
+                               &team_scratch[neighbor * value_count]);
           }
           tree_neighbor_offset *= 2;
         } while (tree_neighbor_offset < team_size);
@@ -383,18 +381,18 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     is_device_ptr(scratch_ptr)
       for (int i = 0; i < max_teams - tree_neighbor_offset;
            i += 2 * tree_neighbor_offset) {
+        typename FunctorAnalysis::Reducer final_reducer(&f);
         ValueType* team_scratch = scratch_ptr;
         const int team_offset   = max_team_threads * value_count;
-        ValueJoin::join(
-            f, &team_scratch[i * team_offset],
+        final_reducer.join(
+            &team_scratch[i * team_offset],
             &team_scratch[(i + tree_neighbor_offset) * team_offset]);
 
         // If `final` is provided by the functor.
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
-          // Do the final only once at the end.
-          if (tree_neighbor_offset * 2 >= max_teams &&
-              omp_get_team_num() == 0 && omp_get_thread_num() == 0)
-            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        // Do the final only once at the end.
+        if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 &&
+            omp_get_thread_num() == 0) {
+          final_reducer.final(scratch_ptr);
         }
       }
       tree_neighbor_offset *= 2;
@@ -422,25 +420,23 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using WorkRange = typename Policy::WorkRange;
 
   using ReducerTypeFwd =
-      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
-                                FunctorType, ReducerType>::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
-  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         Policy, ReducerTypeFwd>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  static constexpr int HasJoin =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, Policy,
+                            FunctorType>::has_join_member_function;
+  static constexpr int UseReducer = is_reducer<ReducerType>::value;
   static constexpr int IsArray    = std::is_pointer<reference_type>::value;
 
   using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type>;
+                               typename Analysis::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
@@ -489,12 +485,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, Policy& arg_policy,
-      const ViewType& arg_result_view,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ViewType& arg_result_view,
+                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void*> = nullptr)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
@@ -537,28 +532,26 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   using Member    = typename Policy::member_type;
   using idx_type  = typename Policy::index_type;
 
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
-  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueJoin   = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
-  using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         Policy, FunctorType>;
 
-  using value_type     = typename ValueTraits::value_type;
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
+  using value_type     = typename Analysis::value_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
 
   const FunctorType m_functor;
   const Policy m_policy;
 
   template <class TagType>
-  typename std::enable_if<std::is_same<TagType, void>::value>::type
-  call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
-                const bool& is_final) const {
+  std::enable_if_t<std::is_void<TagType>::value> call_with_tag(
+      const FunctorType& f, const idx_type& idx, value_type& val,
+      const bool& is_final) const {
     f(idx, val, is_final);
   }
   template <class TagType>
-  typename std::enable_if<!std::is_same<TagType, void>::value>::type
-  call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
-                const bool& is_final) const {
+  std::enable_if_t<!std::is_void<TagType>::value> call_with_tag(
+      const FunctorType& f, const idx_type& idx, value_type& val,
+      const bool& is_final) const {
     f(WorkTag(), idx, val, is_final);
   }
 
@@ -582,6 +575,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
     for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
+      typename Analysis::Reducer final_reducer(&a_functor);
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
@@ -590,16 +584,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         for (idx_type i = 0; i < chunk_size; ++i) {
           const idx_type idx = local_offset + i;
           value_type val;
-          ValueInit::init(a_functor, &val);
+          final_reducer.init(&val);
           if (idx < N) call_with_tag<WorkTag>(a_functor, idx, val, false);
           element_values(team_id, i) = val;
         }
 #pragma omp barrier
         if (omp_get_thread_num() == 0) {
           value_type sum;
-          ValueInit::init(a_functor, &sum);
+          final_reducer.init(&sum);
           for (idx_type i = 0; i < chunk_size; ++i) {
-            ValueJoin::join(a_functor, &sum, &element_values(team_id, i));
+            final_reducer.join(&sum, &element_values(team_id, i));
             element_values(team_id, i) = sum;
           }
           chunk_values(team_id) = sum;
@@ -608,9 +602,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         if (omp_get_thread_num() == 0) {
           if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
             value_type sum;
-            ValueInit::init(a_functor, &sum);
+            final_reducer.init(&sum);
             for (idx_type i = 0; i < n_chunks; ++i) {
-              ValueJoin::join(a_functor, &sum, &chunk_values(i));
+              final_reducer.join(&sum, &chunk_values(i));
               chunk_values(i) = sum;
             }
           }
@@ -622,6 +616,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
     for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
+      typename Analysis::Reducer final_reducer(&a_functor);
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
@@ -629,7 +624,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         if (team_id > 0)
           offset_value = chunk_values(team_id - 1);
         else
-          ValueInit::init(a_functor, &offset_value);
+          final_reducer.init(&offset_value);
 
 #pragma omp for
         for (idx_type i = 0; i < chunk_size; ++i) {
@@ -637,7 +632,18 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
           value_type local_offset_value;
           if (i > 0) {
             local_offset_value = element_values(team_id, i - 1);
-            ValueJoin::join(a_functor, &local_offset_value, &offset_value);
+            // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs
+#ifdef KOKKOS_ARCH_VEGA
+            if constexpr (Analysis::has_join_member_function) {
+              if constexpr (std::is_void_v<WorkTag>)
+                a_functor.join(local_offset_value, offset_value);
+              else
+                a_functor.join(WorkTag{}, local_offset_value, offset_value);
+            } else
+              local_offset_value += offset_value;
+#else
+            final_reducer.join(&local_offset_value, &offset_value);
+#endif
           } else
             local_offset_value = offset_value;
           if (idx < N)
@@ -708,7 +714,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
       base_t::impl_execute(element_values, chunk_values, count);
 
-      const int size = base_t::ValueTraits::value_size(base_t::m_functor);
+      const int size = base_t::Analysis::value_size(base_t::m_functor);
       DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>(
           &m_returnvalue, chunk_values.data() + (n_chunks - 1), size);
     } else {
@@ -742,7 +748,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
   const FunctorType m_functor;
   const Policy m_policy;
-  const int m_shmem_size;
+  const size_t m_shmem_size;
 
  public:
   void execute() const {
@@ -766,7 +772,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
     const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size);
     const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size);
-    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);
+    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1,
+                                     league_size);
 
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
     FunctorType a_functor(m_functor);
@@ -781,6 +788,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
+    // If the league size is <=0, do not launch the kernel.
+    if (nteams <= 0) return;
+
 // Performing our own scheduling of teams to avoid separation of code between
 // teams-distribute and parallel. Gave a 2x performance boost in test cases with
 // the clang compiler. atomic_compare_exchange can be avoided since the standard
@@ -803,7 +813,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
           typename Policy::member_type team(
               league_id, league_size, team_size, vector_length, scratch_ptr,
               blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_same<TagType, void>::value)
+          if constexpr (std::is_void<TagType>::value)
             m_functor(team);
           else
             m_functor(TagType(), team);
@@ -829,17 +839,12 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
   using PolicyType = TeamPolicyInternal<PolicyArgs...>;
   using TagType    = typename PolicyType::work_tag;
   using ReducerTypeFwd =
-      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
-                                FunctorType, ReducerType>::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
-                         void>;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         PolicyType, ReducerTypeFwd>;
 
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
-  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
-  using ReferenceType = typename ValueTraits::reference_type;
+  using ReferenceType = typename Analysis::reference_type;
 
   using ParReduceCommon = ParallelReduceCommon<PointerType>;
 
@@ -857,7 +862,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const size_t shmem_size_L0 = p.scratch_size(0, team_size);
     const size_t shmem_size_L1 = p.scratch_size(1, team_size);
     OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
-                                     shmem_size_L0, shmem_size_L1);
+                                     shmem_size_L0, shmem_size_L1, league_size);
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
 
     ValueType result = ValueType();
@@ -867,6 +872,9 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
+    // If the league size is <=0, do not launch the kernel.
+    if (nteams <= 0) return;
+
 #pragma omp declare reduction(                                         \
     custom:ValueType                                                   \
     : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
@@ -888,7 +896,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
           typename PolicyType::member_type team(
               league_id, league_size, team_size, vector_length, scratch_ptr,
               blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_same<TagType, void>::value)
+          if constexpr (std::is_void<TagType>::value)
             f(team, result);
           else
             f(TagType(), team, result);
@@ -917,7 +925,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const size_t shmem_size_L0 = p.scratch_size(0, team_size);
     const size_t shmem_size_L1 = p.scratch_size(1, team_size);
     OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
-                                     shmem_size_L0, shmem_size_L1);
+                                     shmem_size_L0, shmem_size_L1, league_size);
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
 
     // Maximum active teams possible.
@@ -925,6 +933,9 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
+    // If the league size is <=0, do not launch the kernel.
+    if (nteams <= 0) return;
+
     // Case where the number of reduction items is 1.
     if constexpr (NumReductions == 1) {
       ValueType result = ValueType();
@@ -946,7 +957,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
               typename PolicyType::member_type team(
                   league_id, league_size, team_size, vector_length, scratch_ptr,
                   blockIdx, shmem_size_L0, shmem_size_L1);
-              if constexpr (std::is_same<TagType, void>::value)
+              if constexpr (std::is_void<TagType>::value)
                 f(team, result);
               else
                 f(TagType(), team, result);
@@ -973,7 +984,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
               typename PolicyType::member_type team(
                   league_id, league_size, team_size, vector_length, scratch_ptr,
                   blockIdx, shmem_size_L0, shmem_size_L1);
-              if constexpr (std::is_same<TagType, void>::value)
+              if constexpr (std::is_void<TagType>::value)
                 f(team, result);
               else
                 f(TagType(), team, result);
@@ -1004,7 +1015,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
             typename PolicyType::member_type team(
                 league_id, league_size, team_size, vector_length, scratch_ptr,
                 blockIdx, shmem_size_L0, shmem_size_L1);
-            if constexpr (std::is_same<TagType, void>::value)
+            if constexpr (std::is_void<TagType>::value)
               f(team, result);
             else
               f(TagType(), team, result);
@@ -1023,7 +1034,10 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
   // RangePolicy. Need a new implementation.
   static void execute_init_join(const FunctorType& f, const PolicyType& p,
                                 PointerType ptr, const bool ptr_on_device) {
-    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+    using FunctorAnalysis =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType,
+                              FunctorType>;
+    constexpr int HasInit = FunctorAnalysis::has_init_member_function;
 
     const int league_size   = p.league_size();
     const int team_size     = p.team_size();
@@ -1047,11 +1061,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams = league_size;
 
     // Number of elements in the reduction
-    const auto value_count =
-        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+    const auto value_count = FunctorAnalysis::value_count(f);
 
     // Allocate scratch per active thread.
-    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
+                                     league_size);
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
 
     // Enter this loop if the functor has an `init`
@@ -1060,10 +1074,9 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
       // device members.
 #pragma omp target map(to : f) is_device_ptr(scratch_ptr)
       {
-        ValueInit::init(f, scratch_ptr);
-
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        final_reducer.init(scratch_ptr);
+        final_reducer.final(scratch_ptr);
       }
     } else {
 #pragma omp target map(to : f) is_device_ptr(scratch_ptr)
@@ -1072,8 +1085,8 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
           static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
         }
 
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        final_reducer.final(static_cast<ValueType*>(scratch_ptr));
       }
     }
 
@@ -1102,14 +1115,15 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
         const int num_teams     = omp_get_num_teams();
         ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr) +
                                   team_num * team_size * value_count;
-        ReferenceType result = ValueInit::init(f, &team_scratch[0]);
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        ReferenceType result = final_reducer.init(&team_scratch[0]);
 
         for (int league_id = team_num; league_id < league_size;
              league_id += num_teams) {
           typename PolicyType::member_type team(
               league_id, league_size, team_size, vector_length, scratch_ptr,
               team_num, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_same<TagType, void>::value) {
+          if constexpr (std::is_void<TagType>::value) {
             f(team, result);
           } else {
             f(TagType(), team, result);
@@ -1127,16 +1141,16 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
            i += 2 * tree_neighbor_offset) {
         ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
         const int team_offset   = team_size * value_count;
-        ValueJoin::join(
-            f, &team_scratch[i * team_offset],
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        final_reducer.join(
+            &team_scratch[i * team_offset],
             &team_scratch[(i + tree_neighbor_offset) * team_offset]);
 
         // If `final` is provided by the functor.
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
-          // Do the final only once at the end.
-          if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
-              omp_get_thread_num() == 0)
-            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        // Do the final only once at the end.
+        if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
+            omp_get_thread_num() == 0) {
+          final_reducer.final(scratch_ptr);
         }
       }
       tree_neighbor_offset *= 2;
@@ -1165,37 +1179,36 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using WorkTag = typename Policy::work_tag;
   using Member  = typename Policy::member_type;
   using ReducerTypeFwd =
-      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
-                                FunctorType, ReducerType>::type;
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
   using WorkTagFwd =
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         Policy, ReducerTypeFwd>;
 
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-  using value_type     = typename ValueTraits::value_type;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+  using value_type     = typename Analysis::value_type;
 
   bool m_result_ptr_on_device;
   const int m_result_ptr_num_elems;
 
-  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
-  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+  static constexpr int HasJoin =
+      Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, Policy,
+                            FunctorType>::has_join_member_function;
+  static constexpr int UseReducer = is_reducer<ReducerType>::value;
   static constexpr int IsArray    = std::is_pointer<reference_type>::value;
 
   using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type>;
+                               typename Analysis::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
-  const int m_shmem_size;
+  const size_t m_shmem_size;
 
  public:
   void execute() const {
@@ -1231,12 +1244,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   }
 
   template <class ViewType>
-  ParallelReduce(
-      const FunctorType& arg_functor, const Policy& arg_policy,
-      const ViewType& arg_result,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = nullptr)
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ViewType& arg_result,
+                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void*> = nullptr)
       : m_result_ptr_on_device(
             MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
                               typename ViewType::memory_space>::accessible),
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
index 40d8c45f5..2399b424f 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
@@ -48,7 +48,6 @@
 #include <omp.h>
 #include <Kokkos_Parallel.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 
 // WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly,
 // this was tracked down to a bug in clang with regards of mapping structs
@@ -68,6 +67,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   using Policy  = Kokkos::MDRangePolicy<Traits...>;
   using WorkTag = typename Policy::work_tag;
   using Member  = typename Policy::member_type;
+  using Index   = typename Policy::index_type;
 
   const FunctorType m_functor;
   const Policy m_policy;
@@ -117,21 +117,21 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <int Rank>
-  inline typename std::enable_if<Rank == 2>::type execute_tile(
+  inline std::enable_if_t<Rank == 2> execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
       const Policy& policy) const {
 #ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
     (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
 
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
 
 #pragma omp target teams distribute parallel for collapse(2) map(to : functor)
     for (auto i0 = begin_0; i0 < end_0; ++i0) {
       for (auto i1 = begin_1; i1 < end_1; ++i1) {
-        if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+        if constexpr (std::is_void<typename Policy::work_tag>::value)
           functor(i0, i1);
         else
           functor(typename Policy::work_tag(), i0, i1);
@@ -149,7 +149,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #pragma omp for collapse(2)
     for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
       for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) {
-        if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+        if constexpr (std::is_void<typename Policy::work_tag>::value)
           functor(i0, i1);
         else
           functor(typename Policy::work_tag(), i0, i1);
@@ -158,24 +158,24 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <int Rank>
-  inline typename std::enable_if<Rank == 3>::type execute_tile(
+  inline std::enable_if_t<Rank == 3> execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
       const Policy& policy) const {
 #ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
     (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[2];
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[2];
 
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
 
 #pragma omp target teams distribute parallel for collapse(3) map(to : functor)
     for (auto i0 = begin_0; i0 < end_0; ++i0) {
       for (auto i1 = begin_1; i1 < end_1; ++i1) {
         for (auto i2 = begin_2; i2 < end_2; ++i2) {
-          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+          if constexpr (std::is_void<typename Policy::work_tag>::value)
             functor(i0, i1, i2);
           else
             functor(typename Policy::work_tag(), i0, i1, i2);
@@ -199,7 +199,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
       for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
         for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) {
-          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+          if constexpr (std::is_void<typename Policy::work_tag>::value)
             functor(i0, i1, i2);
           else
             functor(typename Policy::work_tag(), i0, i1, i2);
@@ -208,27 +208,27 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <int Rank>
-  inline typename std::enable_if<Rank == 4>::type execute_tile(
+  inline std::enable_if_t<Rank == 4> execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
       const Policy& policy) const {
 #ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
     (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[2];
-    const auto begin_3 = policy.m_lower[3];
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[2];
+    const Index begin_3 = policy.m_lower[3];
 
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
-    const auto end_3 = policy.m_upper[3];
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
+    const Index end_3 = policy.m_upper[3];
 
 #pragma omp target teams distribute parallel for collapse(4) map(to : functor)
     for (auto i0 = begin_0; i0 < end_0; ++i0) {
       for (auto i1 = begin_1; i1 < end_1; ++i1) {
         for (auto i2 = begin_2; i2 < end_2; ++i2) {
           for (auto i3 = begin_3; i3 < end_3; ++i3) {
-            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            if constexpr (std::is_void<typename Policy::work_tag>::value)
               functor(i0, i1, i2, i3);
             else
               functor(typename Policy::work_tag(), i0, i1, i2, i3);
@@ -258,7 +258,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
         for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
           for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) {
-            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            if constexpr (std::is_void<typename Policy::work_tag>::value)
               functor(i0, i1, i2, i3);
             else
               functor(typename Policy::work_tag(), i0, i1, i2, i3);
@@ -267,22 +267,22 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <int Rank>
-  inline typename std::enable_if<Rank == 5>::type execute_tile(
+  inline std::enable_if_t<Rank == 5> execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
       const Policy& policy) const {
 #ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
     (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[2];
-    const auto begin_3 = policy.m_lower[3];
-    const auto begin_4 = policy.m_lower[4];
-
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
-    const auto end_3 = policy.m_upper[3];
-    const auto end_4 = policy.m_upper[4];
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[2];
+    const Index begin_3 = policy.m_lower[3];
+    const Index begin_4 = policy.m_lower[4];
+
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
+    const Index end_3 = policy.m_upper[3];
+    const Index end_4 = policy.m_upper[4];
 
 #pragma omp target teams distribute parallel for collapse(5) map(to : functor)
     for (auto i0 = begin_0; i0 < end_0; ++i0) {
@@ -337,24 +337,24 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <int Rank>
-  inline typename std::enable_if<Rank == 6>::type execute_tile(
+  inline std::enable_if_t<Rank == 6> execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
       const Policy& policy) const {
 #ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
     (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[2];
-    const auto begin_3 = policy.m_lower[3];
-    const auto begin_4 = policy.m_lower[4];
-    const auto begin_5 = policy.m_lower[5];
-
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
-    const auto end_3 = policy.m_upper[3];
-    const auto end_4 = policy.m_upper[4];
-    const auto end_5 = policy.m_upper[5];
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[2];
+    const Index begin_3 = policy.m_lower[3];
+    const Index begin_4 = policy.m_lower[4];
+    const Index begin_5 = policy.m_lower[5];
+
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
+    const Index end_3 = policy.m_upper[3];
+    const Index end_4 = policy.m_upper[4];
+    const Index end_5 = policy.m_upper[5];
 
 #pragma omp target teams distribute parallel for collapse(6) map(to : functor)
     for (auto i0 = begin_0; i0 < end_0; ++i0) {
@@ -446,23 +446,24 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
   using WorkTag = typename Policy::work_tag;
   using Member  = typename Policy::member_type;
+  using Index   = typename Policy::index_type;
 
   using ReducerConditional =
       std::conditional<std::is_same<InvalidType, ReducerType>::value,
                        FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
-                         void>;
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         Policy, ReducerTypeFwd>;
 
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
 
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
+  enum {
+    HasJoin =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, Policy,
+                              FunctorType>::has_join_member_function
+  };
+  enum { UseReducer = is_reducer<ReducerType>::value };
 
   const pointer_type m_result_ptr;
   const FunctorType m_functor;
@@ -475,7 +476,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
  public:
   inline void execute() const {
-    execute_tile<Policy::rank, typename ValueTraits::value_type>(
+    execute_tile<Policy::rank, typename Analysis::value_type>(
         m_functor, m_policy, m_result_ptr);
   }
 
@@ -483,9 +484,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   inline ParallelReduce(
       const FunctorType& arg_functor, Policy arg_policy,
       const ViewType& arg_result_view,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void*>::type = NULL)
+      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                           !Kokkos::is_reducer<ReducerType>::value,
+                       void*> = NULL)
       : m_result_ptr(arg_result_view.data()),
         m_functor(arg_functor),
         m_policy(arg_policy),
@@ -506,14 +507,14 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                                   memory_space>::accessible) {}
 
   template <int Rank, class ValueType>
-  inline typename std::enable_if<Rank == 2>::type execute_tile(
-      const FunctorType& functor, const Policy& policy,
-      pointer_type ptr) const {
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
+  inline std::enable_if_t<Rank == 2> execute_tile(const FunctorType& functor,
+                                                  const Policy& policy,
+                                                  pointer_type ptr) const {
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
 
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
 
     ValueType result = ValueType();
 
@@ -531,7 +532,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
               : result)
       for (auto i0 = begin_0; i0 < end_0; ++i0) {
         for (auto i1 = begin_1; i1 < end_1; ++i1) {
-          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+          if constexpr (std::is_void<typename Policy::work_tag>::value)
             functor(i0, i1, result);
           else
             functor(typename Policy::work_tag(), i0, i1, result);
@@ -542,7 +543,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 reduction(+:result)
       for (auto i0 = begin_0; i0 < end_0; ++i0) {
         for (auto i1 = begin_1; i1 < end_1; ++i1) {
-          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+          if constexpr (std::is_void<typename Policy::work_tag>::value)
             functor(i0, i1, result);
           else
             functor(typename Policy::work_tag(), i0, i1, result);
@@ -555,16 +556,16 @@ reduction(+:result)
   }
 
   template <int Rank, class ValueType>
-  inline typename std::enable_if<Rank == 3>::type execute_tile(
-      const FunctorType& functor, const Policy& policy,
-      pointer_type ptr) const {
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[2];
+  inline std::enable_if_t<Rank == 3> execute_tile(const FunctorType& functor,
+                                                  const Policy& policy,
+                                                  pointer_type ptr) const {
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[2];
 
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
 
     ValueType result = ValueType();
 
@@ -583,7 +584,7 @@ reduction(+:result)
       for (auto i0 = begin_0; i0 < end_0; ++i0) {
         for (auto i1 = begin_1; i1 < end_1; ++i1) {
           for (auto i2 = begin_2; i2 < end_2; ++i2) {
-            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            if constexpr (std::is_void<typename Policy::work_tag>::value)
               functor(i0, i1, i2, result);
             else
               functor(typename Policy::work_tag(), i0, i1, i2, result);
@@ -596,7 +597,7 @@ reduction(+:result)
       for (auto i0 = begin_0; i0 < end_0; ++i0) {
         for (auto i1 = begin_1; i1 < end_1; ++i1) {
           for (auto i2 = begin_2; i2 < end_2; ++i2) {
-            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            if constexpr (std::is_void<typename Policy::work_tag>::value)
               functor(i0, i1, i2, result);
             else
               functor(typename Policy::work_tag(), i0, i1, i2, result);
@@ -610,18 +611,18 @@ reduction(+:result)
   }
 
   template <int Rank, class ValueType>
-  inline typename std::enable_if<Rank == 4>::type execute_tile(
-      const FunctorType& functor, const Policy& policy,
-      pointer_type ptr) const {
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[3];
-    const auto begin_3 = policy.m_lower[2];
-
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
-    const auto end_3 = policy.m_upper[3];
+  inline std::enable_if_t<Rank == 4> execute_tile(const FunctorType& functor,
+                                                  const Policy& policy,
+                                                  pointer_type ptr) const {
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[3];
+    const Index begin_3 = policy.m_lower[2];
+
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
+    const Index end_3 = policy.m_upper[3];
 
     ValueType result = ValueType();
 
@@ -673,20 +674,20 @@ reduction(+:result)
   }
 
   template <int Rank, class ValueType>
-  inline typename std::enable_if<Rank == 5>::type execute_tile(
-      const FunctorType& functor, const Policy& policy,
-      pointer_type ptr) const {
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[2];
-    const auto begin_3 = policy.m_lower[3];
-    const auto begin_4 = policy.m_lower[4];
-
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
-    const auto end_3 = policy.m_upper[3];
-    const auto end_4 = policy.m_upper[4];
+  inline std::enable_if_t<Rank == 5> execute_tile(const FunctorType& functor,
+                                                  const Policy& policy,
+                                                  pointer_type ptr) const {
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[2];
+    const Index begin_3 = policy.m_lower[3];
+    const Index begin_4 = policy.m_lower[4];
+
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
+    const Index end_3 = policy.m_upper[3];
+    const Index end_4 = policy.m_upper[4];
 
     ValueType result = ValueType();
 
@@ -744,22 +745,22 @@ reduction(+:result)
   }
 
   template <int Rank, class ValueType>
-  inline typename std::enable_if<Rank == 6>::type execute_tile(
-      const FunctorType& functor, const Policy& policy,
-      pointer_type ptr) const {
-    const auto begin_0 = policy.m_lower[0];
-    const auto begin_1 = policy.m_lower[1];
-    const auto begin_2 = policy.m_lower[2];
-    const auto begin_3 = policy.m_lower[3];
-    const auto begin_4 = policy.m_lower[4];
-    const auto begin_5 = policy.m_lower[5];
-
-    const auto end_0 = policy.m_upper[0];
-    const auto end_1 = policy.m_upper[1];
-    const auto end_2 = policy.m_upper[2];
-    const auto end_3 = policy.m_upper[3];
-    const auto end_4 = policy.m_upper[4];
-    const auto end_5 = policy.m_upper[5];
+  inline std::enable_if_t<Rank == 6> execute_tile(const FunctorType& functor,
+                                                  const Policy& policy,
+                                                  pointer_type ptr) const {
+    const Index begin_0 = policy.m_lower[0];
+    const Index begin_1 = policy.m_lower[1];
+    const Index begin_2 = policy.m_lower[2];
+    const Index begin_3 = policy.m_lower[3];
+    const Index begin_4 = policy.m_lower[4];
+    const Index begin_5 = policy.m_lower[5];
+
+    const Index end_0 = policy.m_upper[0];
+    const Index end_1 = policy.m_upper[1];
+    const Index end_2 = policy.m_upper[2];
+    const Index end_3 = policy.m_upper[3];
+    const Index end_4 = policy.m_upper[4];
+    const Index end_5 = policy.m_upper[5];
 
     ValueType result = ValueType();
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
index a46a64ea6..1ada2b191 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY)
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 48f6b74dc..840db4327 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Concepts.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
 #include <Kokkos_SYCL.hpp>
@@ -49,6 +53,8 @@
 #include <Kokkos_Serial.hpp>
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
 
 namespace {
 template <typename C>
@@ -69,12 +75,6 @@ struct Container {
 }  // namespace
 
 namespace Kokkos {
-
-namespace Impl {
-// forward-declaration
-int get_gpu(const InitArguments& args);
-}  // namespace Impl
-
 namespace Experimental {
 SYCL::SYCL()
     : m_space_instance(&Impl::SYCLInternal::singleton(),
@@ -105,24 +105,22 @@ bool SYCL::impl_is_initialized() {
 
 void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); }
 
-void SYCL::print_configuration(std::ostream& s, const bool detailed) {
-  s << "macro  KOKKOS_ENABLE_SYCL : defined" << '\n';
-  if (detailed)
-    SYCL::impl_sycl_info(s, m_space_instance->m_queue->get_device());
-}
+void SYCL::print_configuration(std::ostream& os, bool verbose) const {
+  os << "Devices:\n";
+  os << "  KOKKOS_ENABLE_SYCL: yes\n";
+
+  os << "\nRuntime Configuration:\n";
 
-void SYCL::fence() const {
-  fence("Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence");
+  os << "macro  KOKKOS_ENABLE_SYCL : defined\n";
+  if (verbose)
+    SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device());
 }
+
 void SYCL::fence(const std::string& name) const {
   Impl::SYCLInternal::fence(*m_space_instance->m_queue, name,
                             impl_instance_id());
 }
 
-void SYCL::impl_static_fence() {
-  impl_static_fence(
-      "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence");
-}
 void SYCL::impl_static_fence(const std::string& name) {
   Kokkos::Tools::Experimental::Impl::profile_fence_event<
       Kokkos::Experimental::SYCL>(
@@ -148,27 +146,23 @@ int SYCL::sycl_device() const {
   return impl_internal_space_instance()->m_syclDev;
 }
 
-SYCL::SYCLDevice::SYCLDevice(sycl::device d) : m_device(std::move(d)) {}
-
-SYCL::SYCLDevice::SYCLDevice(const sycl::device_selector& selector)
-    : m_device(selector.select_device()) {}
-
-SYCL::SYCLDevice::SYCLDevice(size_t id) {
+void SYCL::impl_initialize(InitializationSettings const& settings) {
   std::vector<sycl::device> gpu_devices =
       sycl::device::get_devices(sycl::info::device_type::gpu);
-  if (id >= gpu_devices.size()) {
-    std::stringstream error_message;
-    error_message << "Requested GPU with id " << id << " but only "
-                  << gpu_devices.size() << " GPU(s) available!\n";
-    Kokkos::Impl::throw_runtime_exception(error_message.str());
+  // If the device id is not specified and there are no GPUs, sidestep Kokkos
+  // device selection and use whatever is available (if no GPU architecture is
+  // specified).
+#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
+    !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) &&   \
+    !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) &&   \
+    !defined(KOKKOS_ARCH_AMPERE)
+  if (!settings.has_device_id() && gpu_devices.empty()) {
+    Impl::SYCLInternal::singleton().initialize(sycl::device());
+    return;
   }
-  m_device = gpu_devices[id];
-}
-
-sycl::device SYCL::SYCLDevice::get_device() const { return m_device; }
-
-void SYCL::impl_initialize(SYCL::SYCLDevice d) {
-  Impl::SYCLInternal::singleton().initialize(d.get_device());
+#endif
+  using Kokkos::Impl::get_gpu;
+  Impl::SYCLInternal::singleton().initialize(gpu_devices[get_gpu(settings)]);
 }
 
 std::ostream& SYCL::impl_sycl_info(std::ostream& os,
@@ -262,9 +256,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
             << device.get_info<device::is_linker_available>()
             << "\nQueue Profiling: "
             << device.get_info<device::queue_profiling>()
-            << "\nBuilt In Kernels: "
-            << Container<std::vector<std::string>>(
-                   device.get_info<device::built_in_kernels>())
             << "\nVendor: " << device.get_info<device::vendor>()
             << "\nProfile: " << device.get_info<device::profile>()
             << "\nVersion: " << device.get_info<device::version>()
@@ -281,54 +272,8 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
 namespace Impl {
 
 int g_sycl_space_factory_initialized =
-    Kokkos::Impl::initialize_space_factory<SYCLSpaceInitializer>("170_SYCL");
+    Kokkos::Impl::initialize_space_factory<SYCL>("170_SYCL");
 
-void SYCLSpaceInitializer::initialize(const InitArguments& args) {
-  // If there are no GPUs return whatever else we can run on if no specific GPU
-  // is requested.
-  const auto num_gpus =
-      sycl::device::get_devices(sycl::info::device_type::gpu).size();
-  int use_gpu = num_gpus == 0 ? args.device_id : Kokkos::Impl::get_gpu(args);
-
-  if (std::is_same<Kokkos::Experimental::SYCL,
-                   Kokkos::DefaultExecutionSpace>::value ||
-      0 < use_gpu) {
-    if (use_gpu > -1) {
-      Kokkos::Experimental::SYCL::impl_initialize(
-          Kokkos::Experimental::SYCL::SYCLDevice(use_gpu));
-    } else {
-      Kokkos::Experimental::SYCL::impl_initialize(
-          Kokkos::Experimental::SYCL::SYCLDevice(sycl::default_selector()));
-    }
-  }
 }
-
-void SYCLSpaceInitializer::finalize(const bool all_spaces) {
-  if (std::is_same<Kokkos::Experimental::SYCL,
-                   Kokkos::DefaultExecutionSpace>::value ||
-      all_spaces) {
-    if (Kokkos::Experimental::SYCL::impl_is_initialized())
-      Kokkos::Experimental::SYCL::impl_finalize();
-  }
-}
-
-void SYCLSpaceInitializer::fence() {
-  Kokkos::Experimental::SYCL::impl_static_fence();
-}
-void SYCLSpaceInitializer::fence(const std::string& name) {
-  Kokkos::Experimental::SYCL::impl_static_fence(name);
-}
-
-void SYCLSpaceInitializer::print_configuration(std::ostream& msg,
-                                               const bool detail) {
-  msg << "Devices:" << std::endl;
-  msg << "  KOKKOS_ENABLE_SYCL: ";
-  msg << "yes" << std::endl;
-
-  msg << "\nRuntime Configuration:" << std::endl;
-  Experimental::SYCL{}.print_configuration(msg, detail);
-}
-
-}  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
index 0cf5a95d8..37721247a 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>  //kokkos_malloc
 
 namespace Kokkos {
@@ -73,8 +77,9 @@ SYCLInternal::~SYCLInternal() {
 
 int SYCLInternal::verify_is_initialized(const char* const label) const {
   if (!is_initialized()) {
-    std::cerr << "Kokkos::Experimental::SYCL::" << label
-              << " : ERROR device not initialized" << std::endl;
+    Kokkos::abort((std::string("Kokkos::Experimental::SYCL::") + label +
+                   " : ERROR device not initialized\n")
+                      .c_str());
   }
   return is_initialized();
 }
@@ -98,11 +103,7 @@ void SYCLInternal::initialize(const sycl::device& d) {
       Kokkos::Impl::throw_runtime_exception(
           "There was an asynchronous SYCL error!\n");
   };
-  // FIXME_SYCL using an in-order queue here should not be necessary since we
-  // are using submit_barrier for managing kernel dependencies but this seems to
-  // be required as a hot fix for now.
-  initialize(
-      sycl::queue{d, exception_handler, sycl::property::queue::in_order()});
+  initialize(sycl::queue{d, exception_handler});
 }
 
 // FIXME_SYCL
@@ -172,8 +173,8 @@ void SYCLInternal::initialize(const sycl::queue& q) {
   m_team_scratch_ptr          = nullptr;
 }
 
-void* SYCLInternal::resize_team_scratch_space(std::int64_t bytes,
-                                              bool force_shrink) {
+sycl::device_ptr<void> SYCLInternal::resize_team_scratch_space(
+    std::int64_t bytes, bool force_shrink) {
   if (m_team_scratch_current_size == 0) {
     m_team_scratch_current_size = bytes;
     m_team_scratch_ptr =
@@ -229,7 +230,7 @@ void SYCLInternal::finalize() {
   m_queue.reset();
 }
 
-void* SYCLInternal::scratch_space(const std::size_t size) {
+sycl::device_ptr<void> SYCLInternal::scratch_space(const std::size_t size) {
   const size_type sizeScratchGrain =
       sizeof(Kokkos::Experimental::SYCL::size_type);
   if (verify_is_initialized("scratch_space") &&
@@ -255,7 +256,7 @@ void* SYCLInternal::scratch_space(const std::size_t size) {
   return m_scratchSpace;
 }
 
-void* SYCLInternal::scratch_flags(const std::size_t size) {
+sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) {
   const size_type sizeScratchGrain =
       sizeof(Kokkos::Experimental::SYCL::size_type);
   if (verify_is_initialized("scratch_flags") &&
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
index 45aacd725..45a788787 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
@@ -66,10 +66,10 @@ class SYCLInternal {
   SYCLInternal& operator=(SYCLInternal&&) = delete;
   SYCLInternal(SYCLInternal&&)            = delete;
 
-  void* scratch_space(const std::size_t size);
-  void* scratch_flags(const std::size_t size);
-  void* resize_team_scratch_space(std::int64_t bytes,
-                                  bool force_shrink = false);
+  sycl::device_ptr<void> scratch_space(const std::size_t size);
+  sycl::device_ptr<void> scratch_flags(const std::size_t size);
+  sycl::device_ptr<void> resize_team_scratch_space(std::int64_t bytes,
+                                                   bool force_shrink = false);
 
   uint32_t impl_get_instance_id() const;
   int m_syclDev = 0;
@@ -78,15 +78,15 @@ class SYCLInternal {
   uint32_t m_maxConcurrency   = 0;
   uint64_t m_maxShmemPerBlock = 0;
 
-  std::size_t m_scratchSpaceCount = 0;
-  size_type* m_scratchSpace       = nullptr;
-  std::size_t m_scratchFlagsCount = 0;
-  size_type* m_scratchFlags       = nullptr;
+  std::size_t m_scratchSpaceCount            = 0;
+  sycl::device_ptr<size_type> m_scratchSpace = nullptr;
+  std::size_t m_scratchFlagsCount            = 0;
+  sycl::device_ptr<size_type> m_scratchFlags = nullptr;
   // mutex to access shared memory
   mutable std::mutex m_mutexScratchSpace;
 
-  int64_t m_team_scratch_current_size = 0;
-  void* m_team_scratch_ptr            = nullptr;
+  int64_t m_team_scratch_current_size       = 0;
+  sycl::device_ptr<void> m_team_scratch_ptr = nullptr;
   mutable std::mutex m_team_scratch_mutex;
 
   uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
index d631c3ba8..cf292f957 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
@@ -56,7 +56,7 @@ struct FunctorWrapperRangePolicyParallelFor {
 
   void operator()(sycl::item<1> item) const {
     const typename Policy::index_type id = item.get_linear_id() + m_begin;
-    if constexpr (std::is_same<WorkTag, void>::value)
+    if constexpr (std::is_void<WorkTag>::value)
       m_functor_wrapper.get_functor()(id);
     else
       m_functor_wrapper.get_functor()(WorkTag(), id);
@@ -87,9 +87,7 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
                                         const sycl::event& memcpy_event) {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
+    sycl::queue& q                          = space.sycl_queue();
 
     auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
       FunctorWrapperRangePolicyParallelFor<Functor, Policy> f{policy.begin(),
@@ -223,9 +221,7 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper,
                                  const sycl::event& memcpy_event) const {
     // Convenience references
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *m_space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
+    sycl::queue& q = m_space.sycl_queue();
 
     if (m_policy.m_num_tiles == 0) return {};
 
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
index eca6f3111..e980a82a5 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
@@ -58,20 +58,18 @@ namespace Kokkos {
 
 namespace Impl {
 
-template <class FunctorValueTraits>
+template <class ReducerType>
 inline constexpr bool use_shuffle_based_algorithm =
-    FunctorValueTraits::StaticValueSize > 0;
+    std::is_reference_v<typename ReducerType::reference_type>;
 
 namespace SYCLReduction {
-template <class ValueJoin, class ValueOps, typename WorkTag, typename ValueType,
-          typename ReducerType, typename FunctorType, int dim>
-std::enable_if_t<
-    !use_shuffle_based_algorithm<FunctorValueTraits<ReducerType, WorkTag>>>
-workgroup_reduction(
+template <typename ValueType, typename ReducerType, int dim>
+std::enable_if_t<!use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(
     sycl::nd_item<dim>& item, sycl::local_ptr<ValueType> local_mem,
-    ValueType* results_ptr, ValueType* device_accessible_result_ptr,
-    const unsigned int value_count, const ReducerType& selected_reducer,
-    const FunctorType& functor, bool final, unsigned int max_size) {
+    sycl::device_ptr<ValueType> results_ptr,
+    sycl::global_ptr<ValueType> device_accessible_result_ptr,
+    const unsigned int value_count, const ReducerType& final_reducer,
+    bool final, unsigned int max_size) {
   const auto local_id = item.get_local_linear_id();
 
   // Perform the actual workgroup reduction in each subgroup
@@ -85,8 +83,7 @@ workgroup_reduction(
       std::min(local_range - id_in_sg, max_size - local_id);
   for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
     if (stride < upper_stride_bound)
-      ValueJoin::join(selected_reducer, result,
-                      &local_mem[(local_id + stride) * value_count]);
+      final_reducer.join(result, &local_mem[(local_id + stride) * value_count]);
     sycl::group_barrier(sg);
   }
   sycl::group_barrier(item.get_group());
@@ -94,8 +91,7 @@ workgroup_reduction(
   // Copy the subgroup results into the first positions of the
   // reduction array.
   if (id_in_sg == 0)
-    ValueOps::copy(functor, &local_mem[sg.get_group_id()[0] * value_count],
-                   result);
+    final_reducer.copy(&local_mem[sg.get_group_id()[0] * value_count], result);
   sycl::group_barrier(item.get_group());
 
   // Do the final reduction only using the first subgroup.
@@ -108,15 +104,15 @@ workgroup_reduction(
     for (unsigned int offset = local_range; offset < n_subgroups;
          offset += local_range)
       if (id_in_sg + offset < n_subgroups)
-        ValueJoin::join(selected_reducer, result_,
-                        &local_mem[(id_in_sg + offset) * value_count]);
+        final_reducer.join(result_,
+                           &local_mem[(id_in_sg + offset) * value_count]);
     sycl::group_barrier(sg);
 
     // Then, we proceed as before.
     for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
       if (id_in_sg + stride < n_subgroups)
-        ValueJoin::join(selected_reducer, result_,
-                        &local_mem[(id_in_sg + stride) * value_count]);
+        final_reducer.join(result_,
+                           &local_mem[(id_in_sg + stride) * value_count]);
       sycl::group_barrier(sg);
     }
 
@@ -126,32 +122,25 @@ workgroup_reduction(
     // final() if necessary.
     if (id_in_sg == 0) {
       if (final) {
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-          FunctorFinal<FunctorType, WorkTag>::final(functor, &local_mem[0]);
+        final_reducer.final(&local_mem[0]);
         if (device_accessible_result_ptr != nullptr)
-          ValueOps::copy(functor, &device_accessible_result_ptr[0],
-                         &local_mem[0]);
+          final_reducer.copy(&device_accessible_result_ptr[0], &local_mem[0]);
         else
-          ValueOps::copy(functor, &results_ptr[0], &local_mem[0]);
+          final_reducer.copy(&results_ptr[0], &local_mem[0]);
       } else
-        ValueOps::copy(functor,
-                       &results_ptr[(item.get_group_linear_id()) * value_count],
-                       &local_mem[0]);
+        final_reducer.copy(
+            &results_ptr[(item.get_group_linear_id()) * value_count],
+            &local_mem[0]);
     }
   }
 }
 
-template <class ValueJoin, typename WorkTag, typename ValueType,
-          typename ReducerType, typename FunctorType, int dim>
-std::enable_if_t<
-    use_shuffle_based_algorithm<FunctorValueTraits<ReducerType, WorkTag>>>
-workgroup_reduction(sycl::nd_item<dim>& item,
-                    sycl::local_ptr<ValueType> local_mem, ValueType local_value,
-                    ValueType* results_ptr,
-                    ValueType* device_accessible_result_ptr,
-                    const ReducerType& selected_reducer,
-                    const FunctorType& functor, bool final,
-                    unsigned int max_size) {
+template <typename ValueType, typename ReducerType, int dim>
+std::enable_if_t<use_shuffle_based_algorithm<ReducerType>> workgroup_reduction(
+    sycl::nd_item<dim>& item, sycl::local_ptr<ValueType> local_mem,
+    ValueType local_value, sycl::device_ptr<ValueType> results_ptr,
+    sycl::global_ptr<ValueType> device_accessible_result_ptr,
+    const ReducerType& final_reducer, bool final, unsigned int max_size) {
   const auto local_id = item.get_local_linear_id();
 
   // Perform the actual workgroup reduction in each subgroup
@@ -164,8 +153,7 @@ workgroup_reduction(sycl::nd_item<dim>& item,
       std::min(local_range - id_in_sg, max_size - local_id);
   for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
     auto tmp = sg.shuffle_down(local_value, stride);
-    if (stride < upper_stride_bound)
-      ValueJoin::join(selected_reducer, &local_value, &tmp);
+    if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp);
   }
 
   // Copy the subgroup results into the first positions of the
@@ -188,8 +176,7 @@ workgroup_reduction(sycl::nd_item<dim>& item,
       for (unsigned int offset = local_range; offset < n_active_subgroups;
            offset += local_range)
         if (id_in_sg + offset < n_active_subgroups) {
-          ValueJoin::join(selected_reducer, &sg_value,
-                          &local_mem[(id_in_sg + offset)]);
+          final_reducer.join(&sg_value, &local_mem[(id_in_sg + offset)]);
         }
       sg.barrier();
     }
@@ -198,7 +185,7 @@ workgroup_reduction(sycl::nd_item<dim>& item,
     for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
       auto tmp = sg.shuffle_down(sg_value, stride);
       if (id_in_sg + stride < n_active_subgroups)
-        ValueJoin::join(selected_reducer, &sg_value, &tmp);
+        final_reducer.join(&sg_value, &tmp);
     }
 
     // Finally, we copy the workgroup results back to global memory
@@ -207,8 +194,7 @@ workgroup_reduction(sycl::nd_item<dim>& item,
     // final() if necessary.
     if (id_in_sg == 0) {
       if (final) {
-        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-          FunctorFinal<FunctorType, WorkTag>::final(functor, &sg_value);
+        final_reducer.final(&sg_value);
         if (device_accessible_result_ptr != nullptr)
           device_accessible_result_ptr[0] = sg_value;
         else
@@ -228,8 +214,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using Policy = Kokkos::RangePolicy<Traits...>;
 
  private:
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
   using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
   using execution_space = typename Analysis::execution_space;
   using value_type      = typename Analysis::value_type;
   using pointer_type    = typename Analysis::pointer_type;
@@ -240,9 +230,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
  public:
   // V - View
   template <typename V>
-  ParallelReduce(
-      const FunctorType& f, const Policy& p, const V& v,
-      typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
+  ParallelReduce(const FunctorType& f, const Policy& p, const V& v,
+                 std::enable_if_t<Kokkos::is_view<V>::value, void*> = nullptr)
       : m_functor(f),
         m_policy(p),
         m_result_ptr(v.data()),
@@ -272,24 +261,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       const PolicyType& policy, const FunctorWrapper& functor_wrapper,
       const ReducerWrapper& reducer_wrapper,
       const std::vector<sycl::event>& memcpy_events) const {
-    using ReducerConditional =
-        Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                           FunctorType, ReducerType>;
-    using ReducerTypeFwd = typename ReducerConditional::type;
-    using WorkTagFwd =
-        std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
-                           WorkTag, void>;
-    using ValueInit =
-        Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-    using ValueJoin =
-        Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-    using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
-
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
+    sycl::queue& q = space.sycl_queue();
 
     // FIXME_SYCL optimize
     constexpr size_t wgroup_size       = 128;
@@ -300,13 +276,13 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
             wgroup_size,
         1);
     const unsigned int value_count =
-        FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
-            ReducerConditional::select(m_functor, m_reducer));
-    const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size));
-    value_type* device_accessible_result_ptr =
+        Analysis::value_count(ReducerConditional::select(m_functor, m_reducer));
+    const auto results_ptr =
+        static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+            sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    sycl::global_ptr<value_type> device_accessible_result_ptr =
         m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-    auto scratch_flags = static_cast<unsigned int*>(
+    auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
         instance.scratch_flags(sizeof(unsigned int)));
 
     sycl::event last_reduction_event;
@@ -323,20 +299,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           const auto& selected_reducer = ReducerConditional::select(
               static_cast<const FunctorType&>(functor),
               static_cast<const ReducerType&>(reducer_wrapper.get_functor()));
-          reference_type update =
-              ValueInit::init(selected_reducer, results_ptr);
+          typename Analysis::Reducer final_reducer(&selected_reducer);
+          reference_type update = final_reducer.init(results_ptr);
           if (size == 1) {
-            if constexpr (std::is_same<WorkTag, void>::value)
+            if constexpr (std::is_void<WorkTag>::value)
               functor(begin, update);
             else
               functor(WorkTag(), begin, update);
           }
-          if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-            FunctorFinal<FunctorType, WorkTag>::final(
-                static_cast<const FunctorType&>(functor), results_ptr);
+          final_reducer.final(results_ptr);
           if (device_accessible_result_ptr != nullptr)
-            ValueOps::copy(functor, &device_accessible_result_ptr[0],
-                           &results_ptr[0]);
+            final_reducer.copy(device_accessible_result_ptr.get(),
+                               results_ptr.get());
         });
       });
       q.ext_oneapi_submit_barrier(
@@ -377,107 +351,93 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                   static_cast<const FunctorType&>(functor),
                   static_cast<const ReducerType&>(
                       reducer_wrapper.get_functor()));
+              typename Analysis::Reducer final_reducer(&selected_reducer);
 
               using index_type       = typename Policy::index_type;
               const auto upper_bound = std::min<index_type>(
                   global_id + values_per_thread * wgroup_size, size);
 
-              if constexpr (FunctorValueTraits<ReducerTypeFwd,
-                                               WorkTagFwd>::StaticValueSize ==
-                            0) {
-                reference_type update = ValueInit::init(
-                    selected_reducer, &local_mem[local_id * value_count]);
+              if constexpr (Analysis::StaticValueSize == 0) {
+                reference_type update =
+                    final_reducer.init(&local_mem[local_id * value_count]);
                 for (index_type id = global_id; id < upper_bound;
                      id += wgroup_size) {
-                  if constexpr (std::is_same<WorkTag, void>::value)
+                  if constexpr (std::is_void<WorkTag>::value)
                     functor(id + begin, update);
                   else
                     functor(WorkTag(), id + begin, update);
                 }
                 item.barrier(sycl::access::fence_space::local_space);
 
-                SYCLReduction::workgroup_reduction<ValueJoin, ValueOps,
-                                                   WorkTag>(
+                SYCLReduction::workgroup_reduction<>(
                     item, local_mem.get_pointer(), results_ptr,
-                    device_accessible_result_ptr, value_count, selected_reducer,
-                    static_cast<const FunctorType&>(functor), false,
-                    std::min(size, wgroup_size));
+                    device_accessible_result_ptr, value_count, final_reducer,
+                    false, std::min(size, wgroup_size));
 
                 if (local_id == 0) {
-                  sycl::ext::oneapi::atomic_ref<
-                      unsigned, sycl::ext::oneapi::memory_order::relaxed,
-                      sycl::ext::oneapi::memory_scope::device,
-                      sycl::access::address_space::global_space>
+                  sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
+                                   sycl::memory_scope::device,
+                                   sycl::access::address_space::global_space>
                       scratch_flags_ref(*scratch_flags);
                   num_teams_done[0] = ++scratch_flags_ref;
                 }
                 item.barrier(sycl::access::fence_space::local_space);
                 if (num_teams_done[0] == n_wgroups) {
                   if (local_id >= n_wgroups)
-                    ValueInit::init(selected_reducer,
-                                    &local_mem[local_id * value_count]);
+                    final_reducer.init(&local_mem[local_id * value_count]);
                   else {
-                    ValueOps::copy(functor, &local_mem[local_id * value_count],
-                                   &results_ptr[local_id * value_count]);
+                    final_reducer.copy(&local_mem[local_id * value_count],
+                                       &results_ptr[local_id * value_count]);
                     for (unsigned int id = local_id + wgroup_size;
                          id < n_wgroups; id += wgroup_size) {
-                      ValueJoin::join(selected_reducer,
-                                      &local_mem[local_id * value_count],
-                                      &results_ptr[id * value_count]);
+                      final_reducer.join(&local_mem[local_id * value_count],
+                                         &results_ptr[id * value_count]);
                     }
                   }
 
-                  SYCLReduction::workgroup_reduction<ValueJoin, ValueOps,
-                                                     WorkTag>(
+                  SYCLReduction::workgroup_reduction<>(
                       item, local_mem.get_pointer(), results_ptr,
-                      device_accessible_result_ptr, value_count,
-                      selected_reducer,
-                      static_cast<const FunctorType&>(functor), true,
-                      std::min(n_wgroups, wgroup_size));
+                      device_accessible_result_ptr, value_count, final_reducer,
+                      true, std::min(n_wgroups, wgroup_size));
                 }
               } else {
                 value_type local_value;
-                reference_type update =
-                    ValueInit::init(selected_reducer, &local_value);
+                reference_type update = final_reducer.init(&local_value);
                 for (index_type id = global_id; id < upper_bound;
                      id += wgroup_size) {
-                  if constexpr (std::is_same<WorkTag, void>::value)
+                  if constexpr (std::is_void<WorkTag>::value)
                     functor(id + begin, update);
                   else
                     functor(WorkTag(), id + begin, update);
                 }
 
-                SYCLReduction::workgroup_reduction<ValueJoin, WorkTag>(
+                SYCLReduction::workgroup_reduction<>(
                     item, local_mem.get_pointer(), local_value, results_ptr,
-                    device_accessible_result_ptr, selected_reducer,
-                    static_cast<const FunctorType&>(functor), false,
+                    device_accessible_result_ptr, final_reducer, false,
                     std::min(size, wgroup_size));
 
                 if (local_id == 0) {
-                  sycl::ext::oneapi::atomic_ref<
-                      unsigned, sycl::ext::oneapi::memory_order::relaxed,
-                      sycl::ext::oneapi::memory_scope::device,
-                      sycl::access::address_space::global_space>
+                  sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
+                                   sycl::memory_scope::device,
+                                   sycl::access::address_space::global_space>
                       scratch_flags_ref(*scratch_flags);
                   num_teams_done[0] = ++scratch_flags_ref;
                 }
                 item.barrier(sycl::access::fence_space::local_space);
                 if (num_teams_done[0] == n_wgroups) {
                   if (local_id >= n_wgroups)
-                    ValueInit::init(selected_reducer, &local_value);
+                    final_reducer.init(&local_value);
                   else {
                     local_value = results_ptr[local_id];
                     for (unsigned int id = local_id + wgroup_size;
                          id < n_wgroups; id += wgroup_size) {
-                      ValueJoin::join(selected_reducer, &local_value,
-                                      &results_ptr[id]);
+                      final_reducer.join(&local_value, &results_ptr[id]);
                     }
                   }
 
-                  SYCLReduction::workgroup_reduction<ValueJoin, WorkTag>(
+                  SYCLReduction::workgroup_reduction<>(
                       item, local_mem.get_pointer(), local_value, results_ptr,
-                      device_accessible_result_ptr, selected_reducer,
-                      static_cast<const FunctorType&>(functor), true,
+                      device_accessible_result_ptr, final_reducer, true,
                       std::min(n_wgroups, wgroup_size));
                 }
               }
@@ -495,9 +455,6 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence(
-          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence due to "
-          "inaccessible reducer result location");
     }
 
     return last_reduction_event;
@@ -543,8 +500,12 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using Policy = Kokkos::MDRangePolicy<Traits...>;
 
  private:
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
   using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
   using execution_space = typename Analysis::execution_space;
   using value_type      = typename Analysis::value_type;
   using pointer_type    = typename Analysis::pointer_type;
@@ -578,9 +539,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
  public:
   // V - View
   template <typename V>
-  ParallelReduce(
-      const FunctorType& f, const Policy& p, const V& v,
-      typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
+  ParallelReduce(const FunctorType& f, const Policy& p, const V& v,
+                 std::enable_if_t<Kokkos::is_view<V>::value, void*> = nullptr)
       : m_functor(f),
         m_policy(p),
         m_space(p.space()),
@@ -612,23 +572,10 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       const PolicyType& policy, const FunctorWrapper& functor_wrapper,
       const ReducerWrapper& reducer_wrapper,
       const std::vector<sycl::event>& memcpy_events) const {
-    using ReducerConditional =
-        Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                           FunctorType, ReducerType>;
-    using ReducerTypeFwd = typename ReducerConditional::type;
-    using WorkTagFwd =
-        std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
-                           WorkTag, void>;
-    using ValueInit =
-        Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-    using ValueJoin =
-        Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-    using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
-
     // Convenience references
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *m_space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
+    sycl::queue& q = m_space.sycl_queue();
 
     const typename Policy::index_type nwork = m_policy.m_num_tiles;
     const typename Policy::index_type block_size =
@@ -644,14 +591,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const auto init_size =
         std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
     const unsigned int value_count =
-        FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
-            ReducerConditional::select(m_functor, m_reducer));
-    // FIXME_SYCL only use the first half
-    const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size));
-    value_type* device_accessible_result_ptr =
+        Analysis::value_count(ReducerConditional::select(m_functor, m_reducer));
+    const auto results_ptr =
+        static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+            sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    sycl::global_ptr<value_type> device_accessible_result_ptr =
         m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-    auto scratch_flags = static_cast<unsigned int*>(
+    auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
         instance.scratch_flags(sizeof(unsigned int)));
 
     sycl::event last_reduction_event;
@@ -667,8 +613,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           const auto& selected_reducer = ReducerConditional::select(
               static_cast<const FunctorType&>(functor),
               static_cast<const ReducerType&>(reducer_wrapper.get_functor()));
-          reference_type update =
-              ValueInit::init(selected_reducer, results_ptr);
+          typename Analysis::Reducer final_reducer(&selected_reducer);
+
+          reference_type update = final_reducer.init(results_ptr);
           if (size == 1) {
             Kokkos::Impl::Reduce::DeviceIterateTile<
                 Policy::rank, BarePolicy, FunctorType,
@@ -676,12 +623,10 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                 policy, functor, update, {1, 1, 1}, {0, 0, 0}, {0, 0, 0})
                 .exec_range();
           }
-          if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-            FunctorFinal<FunctorType, WorkTag>::final(
-                static_cast<const FunctorType&>(functor), results_ptr);
+          final_reducer.final(results_ptr);
           if (device_accessible_result_ptr)
-            ValueOps::copy(functor, &device_accessible_result_ptr[0],
-                           &results_ptr[0]);
+            final_reducer.copy(device_accessible_result_ptr.get(),
+                               results_ptr.get());
         });
       });
       q.ext_oneapi_submit_barrier(
@@ -714,6 +659,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           const auto& selected_reducer = ReducerConditional::select(
               static_cast<const FunctorType&>(functor),
               static_cast<const ReducerType&>(reducer_wrapper.get_functor()));
+          typename Analysis::Reducer final_reducer(&selected_reducer);
 
           // In the first iteration, we call functor to initialize the local
           // memory. Otherwise, the local memory is initialized with the
@@ -732,10 +678,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           const index_type n_global_y = 1;
           const index_type n_global_z = 1;
 
-          if constexpr (FunctorValueTraits<ReducerTypeFwd,
-                                           WorkTagFwd>::StaticValueSize == 0) {
-            reference_type update = ValueInit::init(
-                selected_reducer, &local_mem[local_id * value_count]);
+          if constexpr (Analysis::StaticValueSize == 0) {
+            reference_type update =
+                final_reducer.init(&local_mem[local_id * value_count]);
 
             Kokkos::Impl::Reduce::DeviceIterateTile<
                 Policy::rank, BarePolicy, FunctorType,
@@ -746,46 +691,40 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                 .exec_range();
             item.barrier(sycl::access::fence_space::local_space);
 
-            SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+            SYCLReduction::workgroup_reduction<>(
                 item, local_mem.get_pointer(), results_ptr,
-                device_accessible_result_ptr, value_count, selected_reducer,
-                static_cast<const FunctorType&>(functor), false,
+                device_accessible_result_ptr, value_count, final_reducer, false,
                 std::min(size, wgroup_size));
 
             if (local_id == 0) {
-              sycl::ext::oneapi::atomic_ref<
-                  unsigned, sycl::ext::oneapi::memory_order::relaxed,
-                  sycl::ext::oneapi::memory_scope::device,
-                  sycl::access::address_space::global_space>
+              sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
+                               sycl::memory_scope::device,
+                               sycl::access::address_space::global_space>
                   scratch_flags_ref(*scratch_flags);
               num_teams_done[0] = ++scratch_flags_ref;
             }
             item.barrier(sycl::access::fence_space::local_space);
             if (num_teams_done[0] == n_wgroups) {
               if (local_id >= n_wgroups)
-                ValueInit::init(selected_reducer,
-                                &local_mem[local_id * value_count]);
+                final_reducer.init(&local_mem[local_id * value_count]);
               else {
-                ValueOps::copy(functor, &local_mem[local_id * value_count],
-                               &results_ptr[local_id * value_count]);
+                final_reducer.copy(&local_mem[local_id * value_count],
+                                   &results_ptr[local_id * value_count]);
                 for (unsigned int id = local_id + wgroup_size; id < n_wgroups;
                      id += wgroup_size) {
-                  ValueJoin::join(selected_reducer,
-                                  &local_mem[local_id * value_count],
-                                  &results_ptr[id * value_count]);
+                  final_reducer.join(&local_mem[local_id * value_count],
+                                     &results_ptr[id * value_count]);
                 }
               }
 
-              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+              SYCLReduction::workgroup_reduction<>(
                   item, local_mem.get_pointer(), results_ptr,
-                  device_accessible_result_ptr, value_count, selected_reducer,
-                  static_cast<const FunctorType&>(functor), true,
-                  std::min(n_wgroups, wgroup_size));
+                  device_accessible_result_ptr, value_count, final_reducer,
+                  true, std::min(n_wgroups, wgroup_size));
             }
           } else {
             value_type local_value;
-            reference_type update =
-                ValueInit::init(selected_reducer, &local_value);
+            reference_type update = final_reducer.init(&local_value);
 
             Kokkos::Impl::Reduce::DeviceIterateTile<
                 Policy::rank, BarePolicy, FunctorType,
@@ -795,37 +734,33 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                 {global_x, global_y, global_z}, {local_x, local_y, local_z})
                 .exec_range();
 
-            SYCLReduction::workgroup_reduction<ValueJoin, WorkTag>(
+            SYCLReduction::workgroup_reduction<>(
                 item, local_mem.get_pointer(), local_value, results_ptr,
-                device_accessible_result_ptr, selected_reducer,
-                static_cast<const FunctorType&>(functor), false,
+                device_accessible_result_ptr, final_reducer, false,
                 std::min(size, wgroup_size));
 
             if (local_id == 0) {
-              sycl::ext::oneapi::atomic_ref<
-                  unsigned, sycl::ext::oneapi::memory_order::relaxed,
-                  sycl::ext::oneapi::memory_scope::device,
-                  sycl::access::address_space::global_space>
+              sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
+                               sycl::memory_scope::device,
+                               sycl::access::address_space::global_space>
                   scratch_flags_ref(*scratch_flags);
               num_teams_done[0] = ++scratch_flags_ref;
             }
             item.barrier(sycl::access::fence_space::local_space);
             if (num_teams_done[0] == n_wgroups) {
               if (local_id >= n_wgroups)
-                ValueInit::init(selected_reducer, &local_value);
+                final_reducer.init(&local_value);
               else {
                 local_value = results_ptr[local_id];
                 for (unsigned int id = local_id + wgroup_size; id < n_wgroups;
                      id += wgroup_size) {
-                  ValueJoin::join(selected_reducer, &local_value,
-                                  &results_ptr[id]);
+                  final_reducer.join(&local_value, &results_ptr[id]);
                 }
               }
 
-              SYCLReduction::workgroup_reduction<ValueJoin, WorkTag>(
+              SYCLReduction::workgroup_reduction<>(
                   item, local_mem.get_pointer(), local_value, results_ptr,
-                  device_accessible_result_ptr, selected_reducer,
-                  static_cast<const FunctorType&>(functor), true,
+                  device_accessible_result_ptr, final_reducer, true,
                   std::min(n_wgroups, wgroup_size));
             }
           }
@@ -843,9 +778,6 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           m_space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      m_space.fence(
-          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence after deep "
-          "copying results back");
     }
 
     return last_reduction_event;
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
index e59929562..e2afc9783 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
@@ -57,9 +57,8 @@ namespace Impl {
 // At the end of this function, the subgroup scans are stored in the local array
 // such that the last value (at position n_active_subgroups-1) contains the
 // total sum.
-template <class ValueJoin, class ValueInit, int dim, typename ValueType,
-          typename FunctorType>
-void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& functor,
+template <int dim, typename ValueType, typename FunctorType>
+void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& final_reducer,
                     sycl::local_ptr<ValueType> local_mem,
                     ValueType& local_value, unsigned int global_range) {
   // subgroup scans
@@ -68,7 +67,7 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& functor,
   const auto id_in_sg    = sg.get_local_id()[0];
   for (unsigned int stride = 1; stride < global_range; stride <<= 1) {
     auto tmp = sg.shuffle_up(local_value, stride);
-    if (id_in_sg >= stride) ValueJoin::join(functor, &local_value, &tmp);
+    if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp);
   }
 
   const auto max_subgroup_size = sg.get_max_local_range()[0];
@@ -79,7 +78,7 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& functor,
   if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups)
     local_mem[sg_group_id] = local_value;
   local_value = sg.shuffle_up(local_value, 1);
-  if (id_in_sg == 0) ValueInit::init(functor, &local_value);
+  if (id_in_sg == 0) final_reducer.init(&local_value);
   sycl::group_barrier(item.get_group());
 
   // scan subgroup results using the first subgroup
@@ -96,7 +95,7 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& functor,
           auto tmp = sg.shuffle_up(local_sg_value, stride);
           if (id_in_sg >= stride) {
             if (idx < n_active_subgroups)
-              ValueJoin::join(functor, &local_sg_value, &tmp);
+              final_reducer.join(&local_sg_value, &tmp);
             else
               local_sg_value = tmp;
           }
@@ -104,8 +103,8 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& functor,
         if (idx < n_active_subgroups) {
           local_mem[idx] = local_sg_value;
           if (round > 0)
-            ValueJoin::join(functor, &local_mem[idx],
-                            &local_mem[round * local_range - 1]);
+            final_reducer.join(&local_mem[idx],
+                               &local_mem[round * local_range - 1]);
         }
         if (round + 1 < n_rounds) sycl::group_barrier(sg);
       }
@@ -115,7 +114,7 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& functor,
 
   // add results to all subgroups
   if (sg_group_id > 0)
-    ValueJoin::join(functor, &local_value, &local_mem[sg_group_id - 1]);
+    final_reducer.join(&local_value, &local_mem[sg_group_id - 1]);
 }
 
 template <class FunctorType, class... Traits>
@@ -129,14 +128,12 @@ class ParallelScanSYCLBase {
   using WorkRange    = typename Policy::WorkRange;
   using LaunchBounds = typename Policy::launch_bounds;
 
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
-  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-  using ValueJoin   = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
-
  public:
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using value_type     = typename ValueTraits::value_type;
-  using reference_type = typename ValueTraits::reference_type;
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Kokkos::Experimental::SYCL::size_type;
   using index_type     = typename Policy::index_type;
@@ -174,6 +171,9 @@ class ParallelScanSYCLBase {
       cgh.parallel_for(
           sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
           [=](sycl::nd_item<1> item) {
+            const FunctorType& functor = functor_wrapper.get_functor();
+            typename Analysis::Reducer final_reducer(&functor);
+
             const auto local_id  = item.get_local_linear_id();
             const auto global_id = item.get_global_linear_id();
 
@@ -182,11 +182,10 @@ class ParallelScanSYCLBase {
             if (global_id < size)
               local_value = global_mem[global_id];
             else
-              ValueInit::init(functor_wrapper.get_functor(), &local_value);
+              final_reducer.init(&local_value);
 
-            workgroup_scan<ValueJoin, ValueInit>(
-                item, functor_wrapper.get_functor(), local_mem.get_pointer(),
-                local_value, wgroup_size);
+            workgroup_scan<>(item, final_reducer, local_mem.get_pointer(),
+                             local_value, wgroup_size);
 
             if (n_wgroups > 1 && local_id == wgroup_size - 1)
               group_results[item.get_group_linear_id()] =
@@ -204,11 +203,12 @@ class ParallelScanSYCLBase {
         cgh.parallel_for(
             sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
             [=](sycl::nd_item<1> item) {
-              const auto global_id = item.get_global_linear_id();
+              const auto global_id       = item.get_global_linear_id();
+              const FunctorType& functor = functor_wrapper.get_functor();
+              typename Analysis::Reducer final_reducer(&functor);
               if (global_id < size)
-                ValueJoin::join(functor_wrapper.get_functor(),
-                                &global_mem[global_id],
-                                &group_results[item.get_group_linear_id()]);
+                final_reducer.join(&global_mem[global_id],
+                                   &group_results[item.get_group_linear_id()]);
             });
       });
       q.ext_oneapi_submit_barrier(
@@ -221,9 +221,7 @@ class ParallelScanSYCLBase {
                                  sycl::event memcpy_event) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = m_policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
+    sycl::queue& q                          = space.sycl_queue();
 
     const std::size_t len = m_policy.end() - m_policy.begin();
 
@@ -236,9 +234,12 @@ class ParallelScanSYCLBase {
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
         const typename Policy::index_type id =
             static_cast<typename Policy::index_type>(item.get_id()) + begin;
+        const FunctorType& functor = functor_wrapper.get_functor();
+        typename Analysis::Reducer final_reducer(&functor);
+
         value_type update{};
-        ValueInit::init(functor_wrapper.get_functor(), &update);
-        if constexpr (std::is_same<WorkTag, void>::value)
+        final_reducer.init(&update);
+        if constexpr (std::is_void<WorkTag>::value)
           functor_wrapper.get_functor()(id, update, false);
         else
           functor_wrapper.get_functor()(WorkTag(), id, update, false);
@@ -258,7 +259,7 @@ class ParallelScanSYCLBase {
         auto global_id = item.get_id(0);
 
         value_type update = global_mem[global_id];
-        if constexpr (std::is_same<WorkTag, void>::value)
+        if constexpr (std::is_void<WorkTag>::value)
           functor_wrapper.get_functor()(global_id, update, true);
         else
           functor_wrapper.get_functor()(WorkTag(), global_id, update, true);
@@ -297,8 +298,8 @@ class ParallelScanSYCLBase {
 
     // FIXME_SYCL consider only storing one value per block and recreate initial
     // results in the end before doing the final pass
-    m_scratch_space =
-        static_cast<pointer_type>(instance.scratch_space(total_memory));
+    m_scratch_space = static_cast<sycl::device_ptr<value_type>>(
+        instance.scratch_space(total_memory));
 
     Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
         indirectKernelMem = instance.get_indirect_kernel_mem();
@@ -346,14 +347,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   using Base = ParallelScanSYCLBase<FunctorType, Traits...>;
 
   ReturnType& m_returnvalue;
+  const Kokkos::Experimental::SYCL& m_exec;
 
   inline void execute() {
     Base::impl_execute([&]() {
       const long long nwork = Base::m_policy.end() - Base::m_policy.begin();
       if (nwork > 0) {
-        const int size = Base::ValueTraits::value_size(Base::m_functor);
-        DeepCopy<HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace>(
-            &m_returnvalue, Base::m_scratch_space + nwork - 1, size);
+        const int size = Base::Analysis::value_size(Base::m_functor);
+        DeepCopy<HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
+                 Kokkos::Experimental::SYCL>(
+            m_exec, &m_returnvalue, Base::m_scratch_space + nwork - 1, size);
       }
     });
   }
@@ -361,7 +364,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScanWithTotal(const FunctorType& arg_functor,
                         const typename Base::Policy& arg_policy,
                         ReturnType& arg_returnvalue)
-      : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {}
+      : Base(arg_functor, arg_policy),
+        m_returnvalue(arg_returnvalue),
+        m_exec(arg_policy.space()) {}
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index bf37dcb26..5ac7d8af3 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -70,8 +70,8 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   int m_league_size;
   int m_team_size;
   int m_vector_length;
-  int m_team_scratch_size[2];
-  int m_thread_scratch_size[2];
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
   int m_chunk_size;
   bool m_tune_team_size;
   bool m_tune_vector_length;
@@ -172,15 +172,17 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
 
   int league_size() const { return m_league_size; }
 
-  int scratch_size(int level, int team_size_ = -1) const {
+  size_t scratch_size(int level, int team_size_ = -1) const {
     if (team_size_ < 0) team_size_ = m_team_size;
     return m_team_scratch_size[level] +
            team_size_ * m_thread_scratch_size[level];
   }
 
-  int team_scratch_size(int level) const { return m_team_scratch_size[level]; }
+  size_t team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
 
-  int thread_scratch_size(int level) const {
+  size_t thread_scratch_size(int level) const {
     return m_thread_scratch_size[level];
   }
 
@@ -408,8 +410,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_type const m_vector_size;
   int m_shmem_begin;
   int m_shmem_size;
-  char* m_scratch_ptr[2];
-  int m_scratch_size[2];
+  sycl::device_ptr<char> m_global_scratch_ptr;
+  size_t m_scratch_size[2];
   // Only let one ParallelFor/Reduce modify the team scratch memory. The
   // constructor acquires the mutex which is released in the destructor.
   std::scoped_lock<std::mutex> m_scratch_lock;
@@ -420,9 +422,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                  const sycl::event& memcpy_events) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
+    sycl::queue& q                          = space.sycl_queue();
 
     auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
       // FIXME_SYCL accessors seem to need a size greater than zero at least for
@@ -430,20 +430,21 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
       sycl::accessor<char, 1, sycl::access::mode::read_write,
                      sycl::access::target::local>
           team_scratch_memory_L0(
-              sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
+              sycl::range<1>(
+                  std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
               cgh);
 
       // Avoid capturing *this since it might not be trivially copyable
-      const auto shmem_begin     = m_shmem_begin;
-      const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
-      char* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
+      const auto shmem_begin       = m_shmem_begin;
+      const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
+      sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
 
       auto lambda = [=](sycl::nd_item<2> item) {
         const member_type team_member(
             team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0],
-            scratch_ptr[1] + item.get_group(1) * scratch_size[1],
+            global_scratch_ptr + item.get_group(1) * scratch_size[1],
             scratch_size[1], item);
-        if constexpr (std::is_same<work_tag, void>::value)
+        if constexpr (std::is_void<work_tag>::value)
           functor_wrapper.get_functor()(team_member);
         else
           functor_wrapper.get_functor()(work_tag(), team_member);
@@ -516,13 +517,12 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
 
-    // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    auto& space      = *m_policy.space().impl_internal_space_instance();
-    m_scratch_ptr[0] = nullptr;
-    m_scratch_ptr[1] = static_cast<char*>(space.resize_team_scratch_space(
-        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
+    auto& space = *m_policy.space().impl_internal_space_instance();
+    m_global_scratch_ptr =
+        static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space(
+            static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -554,8 +554,12 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
 
  private:
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
   using Analysis =
-      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
   using member_type   = typename Policy::member_type;
   using WorkTag       = typename Policy::work_tag;
   using launch_bounds = typename Policy::launch_bounds;
@@ -574,14 +578,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
-  // FIXME_SYCL avoid reallocating memory for reductions
-  /*  size_type* m_scratch_space;
-    size_type* m_scratch_flags;
-    size_type m_team_begin;*/
   size_type m_shmem_begin;
   size_type m_shmem_size;
-  char* m_scratch_ptr[2];
-  int m_scratch_size[2];
+  sycl::device_ptr<char> m_global_scratch_ptr;
+  size_t m_scratch_size[2];
   const size_type m_league_size;
   int m_team_size;
   const size_type m_vector_size;
@@ -595,28 +595,14 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       const PolicyType& policy, const FunctorWrapper& functor_wrapper,
       const ReducerWrapper& reducer_wrapper,
       const std::vector<sycl::event>& memcpy_events) const {
-    using ReducerConditional =
-        Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                           FunctorType, ReducerType>;
-    using ReducerTypeFwd = typename ReducerConditional::type;
-    using WorkTagFwd =
-        std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
-                           WorkTag, void>;
-    using ValueInit =
-        Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-    using ValueJoin =
-        Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-    using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
-
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
+    sycl::queue& q = space.sycl_queue();
 
     const unsigned int value_count =
-        FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
-            ReducerConditional::select(m_functor, m_reducer));
+        Analysis::value_count(ReducerConditional::select(m_functor, m_reducer));
     std::size_t size = std::size_t(m_league_size) * m_team_size * m_vector_size;
     value_type* results_ptr = nullptr;
 
@@ -626,9 +612,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      results_ptr = static_cast<pointer_type>(instance.scratch_space(
-          sizeof(value_type) * std::max(value_count, 1u)));
-      value_type* device_accessible_result_ptr =
+      results_ptr =
+          static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+              sizeof(value_type) * std::max(value_count, 1u)));
+      sycl::global_ptr<value_type> device_accessible_result_ptr =
           m_result_ptr_device_accessible ? m_result_ptr : nullptr;
 
       auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
@@ -637,13 +624,14 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         sycl::accessor<char, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             team_scratch_memory_L0(
-                sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
+                sycl::range<1>(
+                    std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
                 cgh);
 
         // Avoid capturing *this since it might not be trivially copyable
-        const auto shmem_begin     = m_shmem_begin;
-        const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
-        char* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
+        const auto shmem_begin       = m_shmem_begin;
+        const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
+        sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
 
         cgh.depends_on(memcpy_events);
         cgh.parallel_for(
@@ -654,23 +642,22 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                   static_cast<const FunctorType&>(functor),
                   static_cast<const ReducerType&>(
                       reducer_wrapper.get_functor()));
-              reference_type update =
-                  ValueInit::init(selected_reducer, results_ptr);
+              typename Analysis::Reducer final_reducer(&selected_reducer);
+
+              reference_type update = final_reducer.init(results_ptr);
               if (size == 1) {
                 const member_type team_member(
                     team_scratch_memory_L0.get_pointer(), shmem_begin,
-                    scratch_size[0], scratch_ptr[1], scratch_size[1], item);
-                if constexpr (std::is_same<WorkTag, void>::value)
+                    scratch_size[0], global_scratch_ptr, scratch_size[1], item);
+                if constexpr (std::is_void<WorkTag>::value)
                   functor(team_member, update);
                 else
                   functor(WorkTag(), team_member, update);
               }
-              if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-                FunctorFinal<FunctorType, WorkTag>::final(
-                    static_cast<const FunctorType&>(functor), results_ptr);
+              final_reducer.final(results_ptr);
               if (device_accessible_result_ptr)
-                ValueOps::copy(functor, device_accessible_result_ptr,
-                               &results_ptr[0]);
+                final_reducer.copy(device_accessible_result_ptr,
+                                   &results_ptr[0]);
             });
       });
       q.ext_oneapi_submit_barrier(
@@ -682,7 +669,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       // workgroup results back to global memory and recurse until only one
       // workgroup does the reduction and thus gets the final value.
       auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
-        auto scratch_flags = static_cast<unsigned int*>(
+        auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
             instance.scratch_flags(sizeof(unsigned int)));
 
         // FIXME_SYCL accessors seem to need a size greater than zero at least
@@ -690,145 +677,138 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         sycl::accessor<char, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             team_scratch_memory_L0(
-                sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
+                sycl::range<1>(
+                    std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))),
                 cgh);
 
         // Avoid capturing *this since it might not be trivially copyable
-        const auto shmem_begin     = m_shmem_begin;
-        const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
-        char* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
-
-        auto team_reduction_factory = [&](sycl::accessor<
-                                              value_type, 1,
-                                              sycl::access::mode::read_write,
-                                              sycl::access::target::local>
-                                              local_mem,
-                                          value_type* results_ptr) mutable {
-          value_type* device_accessible_result_ptr =
-              m_result_ptr_device_accessible ? m_result_ptr : nullptr;
-          auto lambda = [=](sycl::nd_item<2> item) {
-            auto n_wgroups =
-                item.get_group_range()[0] * item.get_group_range()[1];
-            auto wgroup_size =
-                item.get_local_range()[0] * item.get_local_range()[1];
-            auto size = n_wgroups * wgroup_size;
-
-            auto& num_teams_done = reinterpret_cast<unsigned int&>(
-                local_mem[wgroup_size * std::max(value_count, 1u)]);
-            const auto local_id          = item.get_local_linear_id();
-            const auto& functor          = functor_wrapper.get_functor();
-            const auto& selected_reducer = ReducerConditional::select(
-                static_cast<const FunctorType&>(functor),
-                static_cast<const ReducerType&>(reducer_wrapper.get_functor()));
-
-            if constexpr (FunctorValueTraits<ReducerTypeFwd,
-                                             WorkTagFwd>::StaticValueSize ==
-                          0) {
-              reference_type update = ValueInit::init(
-                  selected_reducer, &local_mem[local_id * value_count]);
-              const member_type team_member(
-                  team_scratch_memory_L0.get_pointer(), shmem_begin,
-                  scratch_size[0],
-                  scratch_ptr[1] + item.get_group(1) * scratch_size[1],
-                  scratch_size[1], item);
-              if constexpr (std::is_same<WorkTag, void>::value)
-                functor(team_member, update);
-              else
-                functor(WorkTag(), team_member, update);
-              item.barrier(sycl::access::fence_space::local_space);
-
-              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
-                  item, local_mem.get_pointer(), results_ptr,
-                  device_accessible_result_ptr, value_count, selected_reducer,
-                  static_cast<const FunctorType&>(functor), false,
-                  std::min<std::size_t>(size, item.get_local_range()[0] *
-                                                  item.get_local_range()[1]));
-
-              if (local_id == 0) {
-                sycl::ext::oneapi::atomic_ref<
-                    unsigned, sycl::ext::oneapi::memory_order::relaxed,
-                    sycl::ext::oneapi::memory_scope::device,
-                    sycl::access::address_space::global_space>
-                    scratch_flags_ref(*scratch_flags);
-                num_teams_done = ++scratch_flags_ref;
-              }
-              sycl::group_barrier(item.get_group());
-              if (num_teams_done == n_wgroups) {
-                if (local_id >= n_wgroups)
-                  ValueInit::init(selected_reducer,
-                                  &local_mem[local_id * value_count]);
-                else {
-                  ValueOps::copy(functor, &local_mem[local_id * value_count],
-                                 &results_ptr[local_id * value_count]);
-                  for (unsigned int id = local_id + wgroup_size; id < n_wgroups;
-                       id += wgroup_size) {
-                    ValueJoin::join(selected_reducer,
-                                    &local_mem[local_id * value_count],
-                                    &results_ptr[id * value_count]);
+        const auto shmem_begin       = m_shmem_begin;
+        const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
+        sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
+
+        auto team_reduction_factory =
+            [&](sycl::accessor<value_type, 1, sycl::access::mode::read_write,
+                               sycl::access::target::local>
+                    local_mem,
+                sycl::device_ptr<value_type> results_ptr) mutable {
+              sycl::global_ptr<value_type> device_accessible_result_ptr =
+                  m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+              auto lambda = [=](sycl::nd_item<2> item) {
+                auto n_wgroups =
+                    item.get_group_range()[0] * item.get_group_range()[1];
+                auto wgroup_size =
+                    item.get_local_range()[0] * item.get_local_range()[1];
+                auto size = n_wgroups * wgroup_size;
+
+                auto& num_teams_done = reinterpret_cast<unsigned int&>(
+                    local_mem[wgroup_size * std::max(value_count, 1u)]);
+                const auto local_id          = item.get_local_linear_id();
+                const auto& functor          = functor_wrapper.get_functor();
+                const auto& selected_reducer = ReducerConditional::select(
+                    static_cast<const FunctorType&>(functor),
+                    static_cast<const ReducerType&>(
+                        reducer_wrapper.get_functor()));
+                typename Analysis::Reducer final_reducer(&selected_reducer);
+
+                if constexpr (Analysis::StaticValueSize == 0) {
+                  reference_type update =
+                      final_reducer.init(&local_mem[local_id * value_count]);
+                  const member_type team_member(
+                      team_scratch_memory_L0.get_pointer(), shmem_begin,
+                      scratch_size[0],
+                      global_scratch_ptr + item.get_group(1) * scratch_size[1],
+                      scratch_size[1], item);
+                  if constexpr (std::is_void<WorkTag>::value)
+                    functor(team_member, update);
+                  else
+                    functor(WorkTag(), team_member, update);
+                  item.barrier(sycl::access::fence_space::local_space);
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem.get_pointer(), results_ptr,
+                      device_accessible_result_ptr, value_count,
+                      selected_reducer, false,
+                      std::min<std::size_t>(size,
+                                            item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
+
+                  if (local_id == 0) {
+                    sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
+                                     sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>
+                        scratch_flags_ref(*scratch_flags);
+                    num_teams_done = ++scratch_flags_ref;
                   }
-                }
-
-                SYCLReduction::workgroup_reduction<ValueJoin, ValueOps,
-                                                   WorkTag>(
-                    item, local_mem.get_pointer(), results_ptr,
-                    device_accessible_result_ptr, value_count, selected_reducer,
-                    static_cast<const FunctorType&>(functor), true,
-                    std::min(n_wgroups, item.get_local_range()[0] *
-                                            item.get_local_range()[1]));
-              }
-            } else {
-              value_type local_value;
-              reference_type update =
-                  ValueInit::init(selected_reducer, &local_value);
-              const member_type team_member(
-                  team_scratch_memory_L0.get_pointer(), shmem_begin,
-                  scratch_size[0],
-                  scratch_ptr[1] + item.get_group(1) * scratch_size[1],
-                  scratch_size[1], item);
-              if constexpr (std::is_same<WorkTag, void>::value)
-                functor(team_member, update);
-              else
-                functor(WorkTag(), team_member, update);
-
-              SYCLReduction::workgroup_reduction<ValueJoin, WorkTag>(
-                  item, local_mem.get_pointer(), local_value, results_ptr,
-                  device_accessible_result_ptr, selected_reducer,
-                  static_cast<const FunctorType&>(functor), false,
-                  std::min<std::size_t>(size, item.get_local_range()[0] *
-                                                  item.get_local_range()[1]));
-
-              if (local_id == 0) {
-                sycl::ext::oneapi::atomic_ref<
-                    unsigned, sycl::ext::oneapi::memory_order::relaxed,
-                    sycl::ext::oneapi::memory_scope::device,
-                    sycl::access::address_space::global_space>
-                    scratch_flags_ref(*scratch_flags);
-                num_teams_done = ++scratch_flags_ref;
-              }
-              item.barrier(sycl::access::fence_space::local_space);
-              if (num_teams_done == n_wgroups) {
-                if (local_id >= n_wgroups)
-                  ValueInit::init(selected_reducer, &local_value);
-                else {
-                  local_value = results_ptr[local_id];
-                  for (unsigned int id = local_id + wgroup_size; id < n_wgroups;
-                       id += wgroup_size) {
-                    ValueJoin::join(selected_reducer, &local_value,
-                                    &results_ptr[id]);
+                  sycl::group_barrier(item.get_group());
+                  if (num_teams_done == n_wgroups) {
+                    if (local_id >= n_wgroups)
+                      final_reducer.init(&local_mem[local_id * value_count]);
+                    else {
+                      final_reducer.copy(&local_mem[local_id * value_count],
+                                         &results_ptr[local_id * value_count]);
+                      for (unsigned int id = local_id + wgroup_size;
+                           id < n_wgroups; id += wgroup_size) {
+                        final_reducer.join(&local_mem[local_id * value_count],
+                                           &results_ptr[id * value_count]);
+                      }
+                    }
+
+                    SYCLReduction::workgroup_reduction<>(
+                        item, local_mem.get_pointer(), results_ptr,
+                        device_accessible_result_ptr, value_count,
+                        selected_reducer, true,
+                        std::min(n_wgroups, item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
+                  }
+                } else {
+                  value_type local_value;
+                  reference_type update = final_reducer.init(&local_value);
+                  const member_type team_member(
+                      team_scratch_memory_L0.get_pointer(), shmem_begin,
+                      scratch_size[0],
+                      global_scratch_ptr + item.get_group(1) * scratch_size[1],
+                      scratch_size[1], item);
+                  if constexpr (std::is_void<WorkTag>::value)
+                    functor(team_member, update);
+                  else
+                    functor(WorkTag(), team_member, update);
+
+                  SYCLReduction::workgroup_reduction<>(
+                      item, local_mem.get_pointer(), local_value, results_ptr,
+                      device_accessible_result_ptr, final_reducer, false,
+                      std::min<std::size_t>(size,
+                                            item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
+
+                  if (local_id == 0) {
+                    sycl::atomic_ref<unsigned, sycl::memory_order::relaxed,
+                                     sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>
+                        scratch_flags_ref(*scratch_flags);
+                    num_teams_done = ++scratch_flags_ref;
+                  }
+                  item.barrier(sycl::access::fence_space::local_space);
+                  if (num_teams_done == n_wgroups) {
+                    if (local_id >= n_wgroups)
+                      final_reducer.init(&local_value);
+                    else {
+                      local_value = results_ptr[local_id];
+                      for (unsigned int id = local_id + wgroup_size;
+                           id < n_wgroups; id += wgroup_size) {
+                        final_reducer.join(&local_value, &results_ptr[id]);
+                      }
+                    }
+
+                    SYCLReduction::workgroup_reduction<>(
+                        item, local_mem.get_pointer(), local_value, results_ptr,
+                        device_accessible_result_ptr, final_reducer, true,
+                        std::min(n_wgroups, item.get_local_range()[0] *
+                                                item.get_local_range()[1]));
                   }
                 }
-
-                SYCLReduction::workgroup_reduction<ValueJoin, WorkTag>(
-                    item, local_mem.get_pointer(), local_value, results_ptr,
-                    device_accessible_result_ptr, selected_reducer,
-                    static_cast<const FunctorType&>(functor), true,
-                    std::min(n_wgroups, item.get_local_range()[0] *
-                                            item.get_local_range()[1]));
-              }
-            }
-          };
-          return lambda;
-        };
+              };
+              return lambda;
+            };
 
         auto dummy_reduction_lambda = team_reduction_factory({1, cgh}, nullptr);
 
@@ -860,8 +840,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
         const auto init_size =
             std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
-        results_ptr = static_cast<pointer_type>(instance.scratch_space(
-            sizeof(value_type) * std::max(value_count, 1u) * init_size));
+        results_ptr =
+            static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
+                sizeof(value_type) * std::max(value_count, 1u) * init_size));
 
         auto reduction_lambda = team_reduction_factory(local_mem, results_ptr);
 
@@ -885,9 +866,6 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence(
-          "Kokkos::Impl::ParallelReduce<TeamPolicy,SYCL>: fence because "
-          "reduction can't access result storage location");
     }
 
     return last_reduction_event;
@@ -935,13 +913,12 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
 
-    // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    auto& space      = *m_policy.space().impl_internal_space_instance();
-    m_scratch_ptr[0] = nullptr;
-    m_scratch_ptr[1] = static_cast<char*>(space.resize_team_scratch_space(
-        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
+    auto& space = *m_policy.space().impl_internal_space_instance();
+    m_global_scratch_ptr =
+        static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space(
+            static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size));
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -960,10 +937,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
  public:
   template <class ViewType>
-  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
-                 ViewType const& arg_result,
-                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
-                                         void*>::type = nullptr)
+  ParallelReduce(
+      FunctorType const& arg_functor, Policy const& arg_policy,
+      ViewType const& arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value, void*> = nullptr)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
index f9b34dca2..07ca907fa 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <Kokkos_HostSpace.hpp>
@@ -63,10 +67,13 @@ void DeepCopySYCL(void* dst, const void* src, size_t n) {
 
 void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
                        const void* src, size_t n) {
-  auto event =
-      instance.impl_internal_space_instance()->m_queue->memcpy(dst, src, n);
-  instance.impl_internal_space_instance()->m_queue->ext_oneapi_submit_barrier(
-      std::vector<sycl::event>{event});
+  // FIXME_SYCL memcpy doesn't respect submit_barrier which means that we need
+  // to actually fence the execution space to make sure the memcpy is properly
+  // enqueued when using out-of-order queues.
+  sycl::queue& q = *instance.impl_internal_space_instance()->m_queue;
+  q.wait_and_throw();
+  auto event = q.memcpy(dst, src, n);
+  q.ext_oneapi_submit_barrier(std::vector<sycl::event>{event});
 }
 
 void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) {
@@ -121,6 +128,23 @@ void* allocate_sycl(
   return hostPtr;
 }
 
+void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space,
+                                   const size_t arg_alloc_size) const {
+  return allocate(exec_space, "[unlabeled]", arg_alloc_size);
+}
+
+void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space,
+                                   const char* arg_label,
+                                   const size_t arg_alloc_size,
+                                   const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocDevice,
+      sycl::usm::alloc::device,
+      *exec_space.impl_internal_space_instance()->m_queue);
+}
+
 void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const {
   return allocate("[unlabeled]", arg_alloc_size);
 }
@@ -135,6 +159,22 @@ void* SYCLDeviceUSMSpace::allocate(const char* arg_label,
       sycl::usm::alloc::device, m_queue);
 }
 
+void* SYCLSharedUSMSpace::allocate(const SYCL& exec_space,
+                                   const size_t arg_alloc_size) const {
+  return allocate(exec_space, "[unlabeled]", arg_alloc_size);
+}
+void* SYCLSharedUSMSpace::allocate(const SYCL& exec_space,
+                                   const char* arg_label,
+                                   const size_t arg_alloc_size,
+                                   const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocShared,
+      sycl::usm::alloc::shared,
+      *exec_space.impl_internal_space_instance()->m_queue);
+}
+
 void* SYCLSharedUSMSpace::allocate(const size_t arg_alloc_size) const {
   return allocate("[unlabeled]", arg_alloc_size);
 }
@@ -148,6 +188,21 @@ void* SYCLSharedUSMSpace::allocate(const char* arg_label,
       sycl::usm::alloc::shared, m_queue);
 }
 
+void* SYCLHostUSMSpace::allocate(const SYCL& exec_space,
+                                 const size_t arg_alloc_size) const {
+  return allocate(exec_space, "[unlabeled]", arg_alloc_size);
+}
+void* SYCLHostUSMSpace::allocate(const SYCL& exec_space, const char* arg_label,
+                                 const size_t arg_alloc_size,
+                                 const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost,
+      sycl::usm::alloc::host,
+      *exec_space.impl_internal_space_instance()->m_queue);
+}
+
 void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const {
   return allocate("[unlabeled]", arg_alloc_size);
 }
@@ -261,6 +316,56 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
       "HostSpace");
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCL& arg_exec_space,
+        const Kokkos::Experimental::SYCLDeviceUSMSpace& space,
+        const std::string& label, const size_t size,
+        const SharedAllocationRecord<void, void>::function_type dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Kokkos::Impl::checked_allocation_with_header(arg_exec_space, space,
+                                                       label, size),
+          sizeof(SharedAllocationHeader) + size, dealloc, label),
+      m_space(space) {
+  SharedAllocationHeader header;
+
+  this->base_t::_fill_host_accessible_header_info(header, label);
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, HostSpace>(
+      arg_exec_space, RecordBase::m_alloc_ptr, &header,
+      sizeof(SharedAllocationHeader));
+}
+
+SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCL& exec_space,
+        const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(exec_space, arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+          arg_label),
+      m_space(arg_space) {
+
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
+}
+
 SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
     SharedAllocationRecord(
         const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
@@ -283,6 +388,29 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
                                                   arg_label);
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCL& exec_space,
+        const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(exec_space, arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
+          arg_label),
+      m_space(arg_space) {
+
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
+}
+
 SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>::
     SharedAllocationRecord(
         const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
index bda2dfd0a..a8c60412c 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -65,7 +65,7 @@ class SYCLTeamMember {
   using scratch_memory_space = execution_space::scratch_memory_space;
 
  private:
-  mutable void* m_team_reduce;
+  mutable sycl::local_ptr<void> m_team_reduce;
   scratch_memory_space m_team_shared;
   int m_team_reduce_size;
   sycl::nd_item<2> m_item;
@@ -109,8 +109,9 @@ class SYCLTeamMember {
   //--------------------------------------------------------------------------
 
   template <class ValueType>
-  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_arithmetic_v<ValueType>>
-  team_broadcast(ValueType& val, const int thread_id) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<std::is_trivially_copyable_v<ValueType>>
+      team_broadcast(ValueType& val, const int thread_id) const {
     val = sycl::group_broadcast(m_item.get_group(), val,
                                 sycl::id<2>(thread_id, 0));
   }
@@ -118,17 +119,18 @@ class SYCLTeamMember {
   // FIXME_SYCL remove/adapt this overload once the Intel oneAPI implementation
   // is conforming to the SYCL2020 standard (allowing trivially-copyable types)
   template <class ValueType>
-  KOKKOS_INLINE_FUNCTION std::enable_if_t<!std::is_arithmetic_v<ValueType>>
-  team_broadcast(ValueType& val, const int thread_id) const {
+  KOKKOS_INLINE_FUNCTION
+      std::enable_if_t<!std::is_trivially_copyable_v<ValueType>>
+      team_broadcast(ValueType& val, const int thread_id) const {
     // Wait for shared data write until all threads arrive here
     sycl::group_barrier(m_item.get_group());
     if (m_item.get_local_id(1) == 0 &&
         static_cast<int>(m_item.get_local_id(0)) == thread_id) {
-      *static_cast<ValueType*>(m_team_reduce) = val;
+      *static_cast<sycl::local_ptr<ValueType>>(m_team_reduce) = val;
     }
     // Wait for shared data read until root thread writes
     sycl::group_barrier(m_item.get_group());
-    val = *(static_cast<ValueType*>(m_team_reduce));
+    val = *static_cast<sycl::local_ptr<ValueType>>(m_team_reduce);
   }
 
   template <class Closure, class ValueType>
@@ -142,17 +144,15 @@ class SYCLTeamMember {
   /**\brief  Reduction across a team
    */
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer) const noexcept {
     team_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer,
-                  typename ReducerType::value_type& value) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer,
+              typename ReducerType::value_type& value) const noexcept {
     using value_type = typename ReducerType::value_type;
 
     auto sg                       = m_item.get_sub_group();
@@ -175,8 +175,9 @@ class SYCLTeamMember {
     const unsigned int maximum_work_range =
         std::min<int>(m_team_reduce_size / sizeof(value_type), n_subgroups);
 
-    const auto id_in_sg  = sg.get_local_id()[0];
-    auto reduction_array = static_cast<value_type*>(m_team_reduce);
+    const auto id_in_sg = sg.get_local_id()[0];
+    auto reduction_array =
+        static_cast<sycl::local_ptr<value_type>>(m_team_reduce);
 
     // Load values into the first maximum_work_range values of the reduction
     // array in chunks. This means that only sub groups with an id in the
@@ -251,7 +252,8 @@ class SYCLTeamMember {
     }
 
     const auto n_active_subgroups = sg.get_group_range()[0];
-    const auto base_data          = static_cast<Type*>(m_team_reduce);
+    const auto base_data =
+        static_cast<sycl::local_ptr<Type>>(m_team_reduce).get();
     if (static_cast<int>(n_active_subgroups * sizeof(Type)) >
         m_team_reduce_size)
       Kokkos::abort("Not implemented!");
@@ -321,17 +323,15 @@ class SYCLTeamMember {
   //----------------------------------------
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer) const {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  vector_reduce(ReducerType const& reducer) const {
     vector_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer,
-                    typename ReducerType::value_type& value) const {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  vector_reduce(ReducerType const& reducer,
+                typename ReducerType::value_type& value) const {
     const auto tidx1   = m_item.get_local_id(1);
     const auto grange1 = m_item.get_local_range(1);
 
@@ -364,12 +364,13 @@ class SYCLTeamMember {
   // Private for the driver
 
   KOKKOS_INLINE_FUNCTION
-  SYCLTeamMember(void* shared, const int shared_begin, const int shared_size,
-                 void* scratch_level_1_ptr, const int scratch_level_1_size,
-                 const sycl::nd_item<2> item)
+  SYCLTeamMember(sycl::local_ptr<void> shared, const int shared_begin,
+                 const int shared_size,
+                 sycl::device_ptr<void> scratch_level_1_ptr,
+                 const int scratch_level_1_size, const sycl::nd_item<2> item)
       : m_team_reduce(shared),
-        m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
-                      scratch_level_1_ptr, scratch_level_1_size),
+        m_team_shared(static_cast<sycl::local_ptr<char>>(shared) + shared_begin,
+                      shared_size, scratch_level_1_ptr, scratch_level_1_size),
         m_team_reduce_size(shared_begin),
         m_item(item) {}
 
@@ -456,9 +457,9 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::SYCLTeamMember>
 TeamThreadRange(const Impl::SYCLTeamMember& thread, iType1 begin, iType2 end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
       thread, iType(begin), iType(end));
 }
@@ -473,10 +474,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::SYCLTeamMember>
 TeamVectorRange(const Impl::SYCLTeamMember& thread, const iType1& begin,
                 const iType2& end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
       thread, iType(begin), iType(end));
 }
@@ -491,10 +492,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::SYCLTeamMember>
 ThreadVectorRange(const Impl::SYCLTeamMember& thread, iType1 arg_begin,
                   iType2 arg_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
       thread, iType(arg_begin), iType(arg_end));
 }
@@ -542,11 +543,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  *  performed and put into result.
  */
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::SYCLTeamMember>& loop_boundaries,
-                    const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::SYCLTeamMember>& loop_boundaries,
+                const Closure& closure, const ReducerType& reducer) {
   typename ReducerType::value_type value;
   reducer.init(value);
 
@@ -569,11 +569,10 @@ KOKKOS_INLINE_FUNCTION
  *  performed and put into result.
  */
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::SYCLTeamMember>& loop_boundaries,
-                    const Closure& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::SYCLTeamMember>& loop_boundaries,
+                const Closure& closure, ValueType& result) {
   ValueType val;
   Kokkos::Sum<ValueType> reducer(val);
 
@@ -653,11 +652,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
 }
 
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                        iType, Impl::SYCLTeamMember>& loop_boundaries,
-                    const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::SYCLTeamMember>& loop_boundaries,
+                const Closure& closure, const ReducerType& reducer) {
   typename ReducerType::value_type value;
   reducer.init(value);
 
@@ -676,11 +674,10 @@ KOKKOS_INLINE_FUNCTION
 }
 
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                        iType, Impl::SYCLTeamMember>& loop_boundaries,
-                    const Closure& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::SYCLTeamMember>& loop_boundaries,
+                const Closure& closure, ValueType& result) {
   ValueType val;
   Kokkos::Sum<ValueType> reducer(val);
 
@@ -744,11 +741,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  *  constructed value.
  */
 template <typename iType, class Closure, class ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<is_reducer<ReducerType>::value>::type
-    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::SYCLTeamMember> const& loop_boundaries,
-                    Closure const& closure, ReducerType const& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::SYCLTeamMember> const& loop_boundaries,
+                Closure const& closure, ReducerType const& reducer) {
   reducer.init(reducer.reference());
 
   const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
@@ -773,11 +769,10 @@ KOKKOS_INLINE_FUNCTION
  *  constructed value.
  */
 template <typename iType, class Closure, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!is_reducer<ValueType>::value>::type
-    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::SYCLTeamMember> const& loop_boundaries,
-                    Closure const& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!is_reducer<ValueType>::value>
+parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::SYCLTeamMember> const& loop_boundaries,
+                Closure const& closure, ValueType& result) {
   result = ValueType();
 
   const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
@@ -801,11 +796,10 @@ KOKKOS_INLINE_FUNCTION
  *  The last call to closure has final == true.
  */
 template <typename iType, class Closure, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
-                      iType, Impl::SYCLTeamMember>& loop_boundaries,
-                  const Closure& closure, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                  iType, Impl::SYCLTeamMember>& loop_boundaries,
+              const Closure& closure, const ReducerType& reducer) {
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp
similarity index 82%
rename from packages/kokkos/core/src/impl/Kokkos_Serial.cpp
rename to packages/kokkos/core/src/Serial/Kokkos_Serial.cpp
index e5917eb59..9205e8256 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp
@@ -42,16 +42,20 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
-#if defined(KOKKOS_ENABLE_SERIAL)
 
-#include <cstdlib>
-#include <sstream>
 #include <Kokkos_Serial.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Error.hpp>
-
+#include <impl/Kokkos_ExecSpaceManager.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
+
+#include <cstdlib>
+#include <iostream>
 #include <sstream>
 
 /*--------------------------------------------------------------------------*/
@@ -178,11 +182,26 @@ Serial::Serial()
 }
 #endif
 
+void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const {
+  os << "Host Serial Execution Space:\n";
+  os << "  KOKKOS_ENABLE_SERIAL: yes\n";
+
+  os << "Serial Atomics:\n";
+  os << "  KOKKOS_ENABLE_SERIAL_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_SERIAL_ATOMICS
+  os << "yes\n";
+#else
+  os << "no\n";
+#endif
+
+  os << "\nSerial Runtime Configuration:\n";
+}
+
 bool Serial::impl_is_initialized() {
   return Impl::SerialInternal::singleton().is_initialized();
 }
 
-void Serial::impl_initialize() {
+void Serial::impl_initialize(InitializationSettings const&) {
   Impl::SerialInternal::singleton().initialize();
 }
 
@@ -193,44 +212,7 @@ const char* Serial::name() { return "Serial"; }
 namespace Impl {
 
 int g_serial_space_factory_initialized =
-    initialize_space_factory<SerialSpaceInitializer>("100_Serial");
-
-void SerialSpaceInitializer::initialize(const InitArguments& args) {
-  // Prevent "unused variable" warning for 'args' input struct.  If
-  // Serial::initialize() ever needs to take arguments from the input
-  // struct, you may remove this line of code.
-  (void)args;
-
-  // Always initialize Serial if it is configure time enabled
-  Kokkos::Serial::impl_initialize();
-}
-
-void SerialSpaceInitializer::finalize(const bool) {
-  if (Kokkos::Serial::impl_is_initialized()) Kokkos::Serial::impl_finalize();
-}
-
-void SerialSpaceInitializer::fence() { Kokkos::Serial::impl_static_fence(); }
-void SerialSpaceInitializer::fence(const std::string& name) {
-  Kokkos::Serial::impl_static_fence(name);
-}
-
-void SerialSpaceInitializer::print_configuration(std::ostream& msg,
-                                                 const bool detail) {
-  msg << "Host Serial Execution Space:" << std::endl;
-  msg << "  KOKKOS_ENABLE_SERIAL: ";
-  msg << "yes" << std::endl;
-
-  msg << "Serial Atomics:" << std::endl;
-  msg << "  KOKKOS_ENABLE_SERIAL_ATOMICS: ";
-#ifdef KOKKOS_ENABLE_SERIAL_ATOMICS
-  msg << "yes" << std::endl;
-#else
-  msg << "no" << std::endl;
-#endif
-
-  msg << "\nSerial Runtime Configuration:" << std::endl;
-  Serial::print_configuration(msg, detail);
-}
+    initialize_space_factory<Serial>("100_Serial");
 
 }  // namespace Impl
 
@@ -243,7 +225,3 @@ constexpr DeviceType DeviceTypeTraits<Serial>::id;
 #endif
 
 }  // namespace Kokkos
-
-#else
-void KOKKOS_CORE_SRC_IMPL_SERIAL_PREVENT_LINK_ERROR() {}
-#endif  // defined( KOKKOS_ENABLE_SERIAL )
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
new file mode 100644
index 000000000..d726a86f7
--- /dev/null
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKO_SERIAL_PARALLEL_MDRANGE_HPP
+#define KOKKO_SERIAL_PARALLEL_MDRANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                  Kokkos::Serial> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
+
+  const FunctorType m_functor;
+  const MDRangePolicy m_mdr_policy;
+  const Policy m_policy;
+
+  void exec() const {
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      iterate_type(m_mdr_policy, m_functor)(i);
+    }
+  }
+
+ public:
+  inline void execute() const { this->exec(); }
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+  inline ParallelFor(const FunctorType& arg_functor,
+                     const MDRangePolicy& arg_policy)
+      : m_functor(arg_functor),
+        m_mdr_policy(arg_policy),
+        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
+};
+
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
+                     Kokkos::Serial> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+
+  using WorkTag = typename MDRangePolicy::work_tag;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
+
+  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
+                                   MDRangePolicy, ReducerTypeFwd>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
+
+  using iterate_type =
+      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
+                                             WorkTag, reference_type>;
+
+  const FunctorType m_functor;
+  const MDRangePolicy m_mdr_policy;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+
+  inline void exec(reference_type update) const {
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      iterate_type(m_mdr_policy, m_functor, update)(i);
+    }
+  }
+
+ public:
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+  inline void execute() const {
+    const size_t pool_reduce_size =
+        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
+    const size_t team_reduce_size  = 0;  // Never shrinks
+    const size_t team_shared_size  = 0;  // Never shrinks
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
+
+    pointer_type ptr =
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
+
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    reference_type update = final_reducer.init(ptr);
+
+    this->exec(update);
+
+    final_reducer.final(ptr);
+  }
+
+  template <class HostViewType>
+  ParallelReduce(const FunctorType& arg_functor,
+                 const MDRangePolicy& arg_policy,
+                 const HostViewType& arg_result_view,
+                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void*> = nullptr)
+      : m_functor(arg_functor),
+        m_mdr_policy(arg_policy),
+        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(Kokkos::is_view<HostViewType>::value,
+                  "Kokkos::Serial reduce result must be a View");
+
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::Serial reduce result must be a View in HostSpace");
+  }
+
+  inline ParallelReduce(const FunctorType& arg_functor,
+                        MDRangePolicy arg_policy, const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_mdr_policy(arg_policy),
+        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()) {
+    /*static_assert( std::is_same< typename ViewType::memory_space
+                                    , Kokkos::HostSpace >::value
+      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
+      );*/
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp
new file mode 100644
index 000000000..84262227f
--- /dev/null
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp
@@ -0,0 +1,337 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKO_SERIAL_PARALLEL_RANGE_HPP
+#define KOKKO_SERIAL_PARALLEL_RANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Serial> {
+ private:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <class TagType>
+  std::enable_if_t<std::is_void<TagType>::value> exec() const {
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(i);
+    }
+  }
+
+  template <class TagType>
+  std::enable_if_t<!std::is_void<TagType>::value> exec() const {
+    const TagType t{};
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(t, i);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    this->template exec<typename Policy::work_tag>();
+  }
+
+  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+/*--------------------------------------------------------------------------*/
+
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
+                     Kokkos::Serial> {
+ private:
+  using Policy  = Kokkos::RangePolicy<Traits...>;
+  using WorkTag = typename Policy::work_tag;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
+
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+
+  template <class TagType>
+  inline std::enable_if_t<std::is_void<TagType>::value> exec(
+      reference_type update) const {
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(i, update);
+    }
+  }
+
+  template <class TagType>
+  inline std::enable_if_t<!std::is_void<TagType>::value> exec(
+      reference_type update) const {
+    const TagType t{};
+
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(t, i, update);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const size_t pool_reduce_size =
+        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
+    const size_t team_reduce_size  = 0;  // Never shrinks
+    const size_t team_shared_size  = 0;  // Never shrinks
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
+
+    pointer_type ptr =
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
+
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    reference_type update = final_reducer.init(ptr);
+
+    this->template exec<WorkTag>(update);
+
+    final_reducer.final(ptr);
+  }
+
+  template <class HostViewType>
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const HostViewType& arg_result_view,
+                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void*> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(Kokkos::is_view<HostViewType>::value,
+                  "Kokkos::Serial reduce result must be a View");
+
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Kokkos::Serial reduce result must be a View in HostSpace");
+  }
+
+  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
+                        const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()) {
+    /*static_assert( std::is_same< typename ViewType::memory_space
+                                    , Kokkos::HostSpace >::value
+      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
+      );*/
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template <class FunctorType, class... Traits>
+class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                   Kokkos::Serial> {
+ private:
+  using Policy  = Kokkos::RangePolicy<Traits...>;
+  using WorkTag = typename Policy::work_tag;
+
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <class TagType>
+  inline std::enable_if_t<std::is_void<TagType>::value> exec(
+      reference_type update) const {
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(i, update, true);
+    }
+  }
+
+  template <class TagType>
+  inline std::enable_if_t<!std::is_void<TagType>::value> exec(
+      reference_type update) const {
+    const TagType t{};
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(t, i, update, true);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const size_t pool_reduce_size  = Analysis::value_size(m_functor);
+    const size_t team_reduce_size  = 0;  // Never shrinks
+    const size_t team_shared_size  = 0;  // Never shrinks
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
+
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    reference_type update = final_reducer.init(pointer_type(
+        internal_instance->m_thread_team_data.pool_reduce_local()));
+
+    this->template exec<WorkTag>(update);
+  }
+
+  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+/*--------------------------------------------------------------------------*/
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::Serial> {
+ private:
+  using Policy  = Kokkos::RangePolicy<Traits...>;
+  using WorkTag = typename Policy::work_tag;
+
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  ReturnType& m_returnvalue;
+
+  template <class TagType>
+  inline std::enable_if_t<std::is_void<TagType>::value> exec(
+      reference_type update) const {
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(i, update, true);
+    }
+  }
+
+  template <class TagType>
+  inline std::enable_if_t<!std::is_void<TagType>::value> exec(
+      reference_type update) const {
+    const TagType t{};
+    const typename Policy::member_type e = m_policy.end();
+    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+      m_functor(t, i, update, true);
+    }
+  }
+
+ public:
+  inline void execute() {
+    const size_t pool_reduce_size  = Analysis::value_size(m_functor);
+    const size_t team_reduce_size  = 0;  // Never shrinks
+    const size_t team_shared_size  = 0;  // Never shrinks
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
+
+    typename Analysis::Reducer final_reducer(&m_functor);
+
+    reference_type update = final_reducer.init(pointer_type(
+        internal_instance->m_thread_team_data.pool_reduce_local()));
+
+    this->template exec<WorkTag>(update);
+
+    m_returnvalue = update;
+  }
+
+  inline ParallelScanWithTotal(const FunctorType& arg_functor,
+                               const Policy& arg_policy,
+                               ReturnType& arg_returnvalue)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_returnvalue(arg_returnvalue) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp
new file mode 100644
index 000000000..782ae75fe
--- /dev/null
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp
@@ -0,0 +1,424 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKO_SERIAL_PARALLEL_TEAM_HPP
+#define KOKKO_SERIAL_PARALLEL_TEAM_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+/*
+ * < Kokkos::Serial , WorkArgTag >
+ * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial ,
+ * Kokkos::DefaultExecutionSpace >::value >::type >
+ *
+ */
+template <class... Properties>
+class TeamPolicyInternal<Kokkos::Serial, Properties...>
+    : public PolicyTraits<Properties...> {
+ private:
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+  int m_league_size;
+  int m_chunk_size;
+
+ public:
+  //! Tag this class as a kokkos execution policy
+  using execution_policy = TeamPolicyInternal;
+
+  using traits = PolicyTraits<Properties...>;
+
+  //! Execution space of this execution policy:
+  using execution_space = Kokkos::Serial;
+
+  const typename traits::execution_space& space() const {
+    static typename traits::execution_space m_space;
+    return m_space;
+  }
+
+  template <class ExecSpace, class... OtherProperties>
+  friend class TeamPolicyInternal;
+
+  template <class... OtherProperties>
+  TeamPolicyInternal(
+      const TeamPolicyInternal<Kokkos::Serial, OtherProperties...>& p) {
+    m_league_size            = p.m_league_size;
+    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size             = p.m_chunk_size;
+  }
+
+  //----------------------------------------
+
+  template <class FunctorType>
+  int team_size_max(const FunctorType&, const ParallelForTag&) const {
+    return 1;
+  }
+  template <class FunctorType>
+  int team_size_max(const FunctorType&, const ParallelReduceTag&) const {
+    return 1;
+  }
+  template <class FunctorType, class ReducerType>
+  int team_size_max(const FunctorType&, const ReducerType&,
+                    const ParallelReduceTag&) const {
+    return 1;
+  }
+  template <class FunctorType>
+  int team_size_recommended(const FunctorType&, const ParallelForTag&) const {
+    return 1;
+  }
+  template <class FunctorType>
+  int team_size_recommended(const FunctorType&,
+                            const ParallelReduceTag&) const {
+    return 1;
+  }
+  template <class FunctorType, class ReducerType>
+  int team_size_recommended(const FunctorType&, const ReducerType&,
+                            const ParallelReduceTag&) const {
+    return 1;
+  }
+
+  //----------------------------------------
+
+  inline int team_size() const { return 1; }
+  inline bool impl_auto_team_size() const { return false; }
+  inline bool impl_auto_vector_length() const { return false; }
+  inline void impl_set_team_size(size_t) {}
+  inline void impl_set_vector_length(size_t) {}
+  inline int league_size() const { return m_league_size; }
+  inline size_t scratch_size(const int& level, int = 0) const {
+    return m_team_scratch_size[level] + m_thread_scratch_size[level];
+  }
+
+  inline int impl_vector_length() const { return 1; }
+  inline static int vector_length_max() {
+    return 1024;
+  }  // Use arbitrary large number, is meant as a vectorizable length
+
+  inline static int scratch_size_max(int level) {
+    return (level == 0 ? 1024 * 32 : 20 * 1024 * 1024);
+  }
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal(const execution_space&, int league_size_request,
+                     int team_size_request, int /* vector_length_request */ = 1)
+      : m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_league_size(league_size_request),
+        m_chunk_size(32) {
+    if (team_size_request > 1)
+      Kokkos::abort("Kokkos::abort: Requested Team Size is too large!");
+  }
+
+  TeamPolicyInternal(const execution_space& space, int league_size_request,
+                     const Kokkos::AUTO_t& /**team_size_request*/,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(space, league_size_request, -1,
+                           vector_length_request) {}
+
+  TeamPolicyInternal(const execution_space& space, int league_size_request,
+                     const Kokkos::AUTO_t& /* team_size_request */
+                     ,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+                     )
+      : TeamPolicyInternal(space, league_size_request, -1, -1) {}
+
+  TeamPolicyInternal(const execution_space& space, int league_size_request,
+                     int team_size_request,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+                     )
+      : TeamPolicyInternal(space, league_size_request, team_size_request, -1) {}
+
+  TeamPolicyInternal(int league_size_request,
+                     const Kokkos::AUTO_t& team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(),
+                           league_size_request, team_size_request,
+                           vector_length_request) {}
+
+  TeamPolicyInternal(int league_size_request,
+                     const Kokkos::AUTO_t& team_size_request,
+                     const Kokkos::AUTO_t& vector_length_request)
+      : TeamPolicyInternal(typename traits::execution_space(),
+                           league_size_request, team_size_request,
+                           vector_length_request) {}
+  TeamPolicyInternal(int league_size_request, int team_size_request,
+                     const Kokkos::AUTO_t& vector_length_request)
+      : TeamPolicyInternal(typename traits::execution_space(),
+                           league_size_request, team_size_request,
+                           vector_length_request) {}
+
+  TeamPolicyInternal(int league_size_request, int team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(),
+                           league_size_request, team_size_request,
+                           vector_length_request) {}
+
+  inline int chunk_size() const { return m_chunk_size; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal& set_chunk_size(
+      typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(const int& level,
+                                              const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  }
+
+  /** \brief set per thread scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  /** \brief set per thread and per team scratch size for a specific level of
+   * the scratch hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerTeamValue& per_team,
+      const PerThreadValue& per_thread) {
+    m_team_scratch_size[level]   = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  using member_type = Impl::HostThreadTeamMember<Kokkos::Serial>;
+};
+
+template <class FunctorType, class... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Serial> {
+ private:
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  using Policy = TeamPolicyInternal<Kokkos::Serial, Properties...>;
+  using Member = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const int m_league;
+  const size_t m_shared;
+
+  template <class TagType>
+  inline std::enable_if_t<std::is_void<TagType>::value> exec(
+      HostThreadTeamData& data) const {
+    for (int ileague = 0; ileague < m_league; ++ileague) {
+      m_functor(Member(data, ileague, m_league));
+    }
+  }
+
+  template <class TagType>
+  inline std::enable_if_t<!std::is_void<TagType>::value> exec(
+      HostThreadTeamData& data) const {
+    const TagType t{};
+    for (int ileague = 0; ileague < m_league; ++ileague) {
+      m_functor(t, Member(data, ileague, m_league));
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const size_t pool_reduce_size  = 0;  // Never shrinks
+    const size_t team_reduce_size  = TEAM_REDUCE_SIZE;
+    const size_t team_shared_size  = m_shared;
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
+
+    this->template exec<typename Policy::work_tag>(
+        internal_instance->m_thread_team_data);
+  }
+
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league(arg_policy.league_size()),
+        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {}
+};
+
+/*--------------------------------------------------------------------------*/
+
+template <class FunctorType, class ReducerType, class... Properties>
+class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                     ReducerType, Kokkos::Serial> {
+ private:
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  using Policy = TeamPolicyInternal<Kokkos::Serial, Properties...>;
+
+  using Member  = typename Policy::member_type;
+  using WorkTag = typename Policy::work_tag;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
+
+  using Analysis =
+      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const int m_league;
+  const ReducerType m_reducer;
+  pointer_type m_result_ptr;
+  size_t m_shared;
+
+  template <class TagType>
+  inline std::enable_if_t<std::is_void<TagType>::value> exec(
+      HostThreadTeamData& data, reference_type update) const {
+    for (int ileague = 0; ileague < m_league; ++ileague) {
+      m_functor(Member(data, ileague, m_league), update);
+    }
+  }
+
+  template <class TagType>
+  inline std::enable_if_t<!std::is_void<TagType>::value> exec(
+      HostThreadTeamData& data, reference_type update) const {
+    const TagType t{};
+
+    for (int ileague = 0; ileague < m_league; ++ileague) {
+      m_functor(t, Member(data, ileague, m_league), update);
+    }
+  }
+
+ public:
+  inline void execute() const {
+    const size_t pool_reduce_size =
+        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
+
+    const size_t team_reduce_size  = TEAM_REDUCE_SIZE;
+    const size_t team_shared_size  = m_shared;
+    const size_t thread_local_size = 0;  // Never shrinks
+
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
+
+    pointer_type ptr =
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
+
+    typename Analysis::Reducer final_reducer(
+        &ReducerConditional::select(m_functor, m_reducer));
+
+    reference_type update = final_reducer.init(ptr);
+
+    this->template exec<WorkTag>(internal_instance->m_thread_team_data, update);
+
+    final_reducer.final(ptr);
+  }
+
+  template <class ViewType>
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ViewType& arg_result,
+                 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void*> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league(arg_policy.league_size()),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(m_functor, 1)) {
+    static_assert(Kokkos::is_view<ViewType>::value,
+                  "Reduction result on Kokkos::Serial must be a Kokkos::View");
+
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
+                                        Kokkos::HostSpace>::accessible,
+        "Reduction result on Kokkos::Serial must be a Kokkos::View in "
+        "HostSpace");
+  }
+
+  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
+                        const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league(arg_policy.league_size()),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {
+    /*static_assert( std::is_same< typename ViewType::memory_space
+                            , Kokkos::HostSpace >::value
+    , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
+    );*/
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.cpp
similarity index 92%
rename from packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
rename to packages/kokkos/core/src/Serial/Kokkos_Serial_Task.cpp
index 179c55b10..468f27eeb 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.cpp
@@ -42,12 +42,16 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_SERIAL) && defined(KOKKOS_ENABLE_TASKDAG)
+#if defined(KOKKOS_ENABLE_TASKDAG)
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Serial_Task.hpp>
+#include <Serial/Kokkos_Serial_Task.hpp>
 #include <impl/Kokkos_TaskQueue_impl.hpp>
 
 //----------------------------------------------------------------------------
@@ -63,5 +67,4 @@ template class TaskQueue<Kokkos::Serial, typename Kokkos::Serial::memory_space>;
 
 #else
 void KOKKOS_CORE_SRC_IMPL_SERIAL_TASK_PREVENT_LINK_ERROR() {}
-#endif /* #if defined( KOKKOS_ENABLE_SERIAL ) && defined( \
-          KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp
similarity index 98%
rename from packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
rename to packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp
index be732f448..8d8c1d748 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp
@@ -61,7 +61,7 @@ namespace Kokkos {
 namespace Impl {
 
 template <class QueueType>
-class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType> > {
+class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType>> {
  public:
   // Note: Scheduler may be an incomplete type at class scope (but not inside
   // of the methods, obviously)
@@ -131,8 +131,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType> > {
 template <class Scheduler>
 class TaskQueueSpecializationConstrained<
     Scheduler,
-    typename std::enable_if<std::is_same<typename Scheduler::execution_space,
-                                         Kokkos::Serial>::value>::type> {
+    std::enable_if_t<std::is_same<typename Scheduler::execution_space,
+                                  Kokkos::Serial>::value>> {
  public:
   // Note: Scheduler may be an incomplete type at class scope (but not inside
   // of the methods, obviously)
diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_UniqueToken.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_UniqueToken.hpp
new file mode 100644
index 000000000..cc845f3dc
--- /dev/null
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_UniqueToken.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SERIAL_UNIQUE_TOKEN_HPP
+#define KOKKOS_SERIAL_UNIQUE_TOKEN_HPP
+
+#include <Kokkos_UniqueToken.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+template <>
+class UniqueToken<Serial, UniqueTokenScope::Instance> {
+ public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken(execution_space const& = execution_space()) noexcept {}
+
+  /// \brief create object size for requested size on given instance
+  ///
+  /// It is the users responsibility to only acquire size tokens concurrently
+  UniqueToken(size_type, execution_space const& = execution_space()) {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept { return 0; }
+
+  /// \brief release a value acquired by generate
+  KOKKOS_INLINE_FUNCTION
+  void release(int) const noexcept {}
+};
+
+template <>
+class UniqueToken<Serial, UniqueTokenScope::Global> {
+ public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken(execution_space const& = execution_space()) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept { return 0; }
+
+  /// \brief release a value acquired by generate
+  KOKKOS_INLINE_FUNCTION
+  void release(int) const noexcept {}
+};
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_WorkGraphPolicy.hpp
similarity index 95%
rename from packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
rename to packages/kokkos/core/src/Serial/Kokkos_Serial_WorkGraphPolicy.hpp
index 0f6ad5cb0..05980170b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_WorkGraphPolicy.hpp
@@ -58,13 +58,13 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
   FunctorType m_functor;
 
   template <class TagType>
-  typename std::enable_if<std::is_same<TagType, void>::value>::type exec_one(
+  std::enable_if_t<std::is_void<TagType>::value> exec_one(
       const std::int32_t w) const noexcept {
     m_functor(w);
   }
 
   template <class TagType>
-  typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_one(
+  std::enable_if_t<!std::is_void<TagType>::value> exec_one(
       const std::int32_t w) const noexcept {
     const TagType t{};
     m_functor(t, w);
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 9682564ee..346eb1dc0 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -42,8 +42,11 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_THREADS)
 
 #include <cstdint>
 #include <limits>
@@ -51,12 +54,14 @@
 #include <iostream>
 #include <sstream>
 #include <thread>
+#include <mutex>
 
 #include <Kokkos_Core.hpp>
 
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_CPUDiscovery.hpp>
 #include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -64,6 +69,26 @@
 namespace Kokkos {
 namespace Impl {
 namespace {
+std::mutex host_internal_cppthread_mutex;
+
+// std::thread compatible driver.
+// Recovery from an exception would require constant intra-thread health
+// verification; which would negatively impact runtime.  As such simply
+// abort the process.
+void internal_cppthread_driver() {
+  try {
+    ThreadsExec::driver();
+  } catch (const std::exception &x) {
+    std::cerr << "Exception thrown from worker thread: " << x.what()
+              << std::endl;
+    std::cerr.flush();
+    std::abort();
+  } catch (...) {
+    std::cerr << "Exception thrown from worker thread" << std::endl;
+    std::cerr.flush();
+    std::abort();
+  }
+}
 
 ThreadsExec s_threads_process;
 ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr};
@@ -110,6 +135,34 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) {
 namespace Kokkos {
 namespace Impl {
 
+//----------------------------------------------------------------------------
+// Spawn a thread
+
+void ThreadsExec::spawn() {
+  std::thread t(internal_cppthread_driver);
+  t.detach();
+}
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::is_process() {
+  static const std::thread::id master_pid = std::this_thread::get_id();
+
+  return master_pid == std::this_thread::get_id();
+}
+
+void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); }
+
+void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); }
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::wait_yield(volatile int &flag, const int value) {
+  while (value == flag) {
+    std::this_thread::yield();
+  }
+}
+
 void execute_function_noop(ThreadsExec &, const void *) {}
 
 void ThreadsExec::driver() {
@@ -144,11 +197,11 @@ ThreadsExec::ThreadsExec()
     ThreadsExec *const nil = nullptr;
 
     // Which entry in 's_threads_exec', possibly determined from hwloc binding
-    const int entry =
-        ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
-            ? ((size_t)s_current_function_arg)
-            : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0],
-                                                     s_threads_coord));
+    const int entry = reinterpret_cast<size_t>(s_current_function_arg) <
+                              size_t(s_thread_pool_size[0])
+                          ? reinterpret_cast<size_t>(s_current_function_arg)
+                          : size_t(Kokkos::hwloc::bind_this_thread(
+                                s_thread_pool_size[0], s_threads_coord));
 
     // Given a good entry set this thread in the 's_threads_exec' array
     if (entry < s_thread_pool_size[0] &&
@@ -297,7 +350,7 @@ void ThreadsExec::fence(const std::string &name) {
 void ThreadsExec::internal_fence(Impl::fence_is_static is_static) {
   internal_fence((is_static == Impl::fence_is_static::no)
                      ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence"
-                     : "Kokkos::ThreadsExec::fence: Unnamed Global Fence",
+                     : "Kokkos::ThreadsExec::fence: Unnamed Static Fence",
                  is_static);
 }
 
@@ -582,9 +635,12 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) {
 
 int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; }
 
-void ThreadsExec::initialize(unsigned thread_count, unsigned use_numa_count,
-                             unsigned use_cores_per_numa,
-                             bool allow_asynchronous_threadpool) {
+void ThreadsExec::initialize(int thread_count_arg) {
+  // legacy arguments
+  unsigned thread_count       = thread_count_arg == -1 ? 0 : thread_count_arg;
+  unsigned use_numa_count     = 0;
+  unsigned use_cores_per_numa = 0;
+  bool allow_asynchronous_threadpool = false;
   // need to provide an initializer for Intel compilers
   static const Sentinel sentinel = {};
 
@@ -637,7 +693,7 @@ void ThreadsExec::initialize(unsigned thread_count, unsigned use_numa_count,
       // choose its own entry in 's_threads_coord'
       // otherwise specify the entry.
       s_current_function_arg =
-          (void *)static_cast<uintptr_t>(hwloc_can_bind ? ~0u : ith);
+          reinterpret_cast<void *>(hwloc_can_bind ? ~0u : ith);
 
       // Make sure all outstanding memory writes are complete
       // before spawning the new thread.
@@ -804,9 +860,6 @@ void ThreadsExec::finalize() {
 namespace Kokkos {
 
 int Threads::concurrency() { return impl_thread_pool_size(0); }
-void Threads::fence() const {
-  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::no);
-}
 void Threads::fence(const std::string &name) const {
   Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no);
 }
@@ -834,56 +887,7 @@ const char *Threads::name() { return "Threads"; }
 namespace Impl {
 
 int g_threads_space_factory_initialized =
-    initialize_space_factory<ThreadsSpaceInitializer>("050_Threads");
-
-void ThreadsSpaceInitializer::initialize(const InitArguments &args) {
-  const int num_threads = args.num_threads;
-  const int use_numa    = args.num_numa;
-  if (std::is_same<Kokkos::Threads, Kokkos::DefaultExecutionSpace>::value ||
-      std::is_same<Kokkos::Threads,
-                   Kokkos::HostSpace::execution_space>::value) {
-    if (num_threads > 0) {
-      if (use_numa > 0) {
-        Kokkos::Threads::impl_initialize(num_threads, use_numa);
-      } else {
-        Kokkos::Threads::impl_initialize(num_threads);
-      }
-    } else {
-      Kokkos::Threads::impl_initialize();
-    }
-    // std::cout << "Kokkos::initialize() fyi: CppThread enabled and
-    // initialized"
-    // << std::endl ;
-  } else {
-    // std::cout << "Kokkos::initialize() fyi: CppThread enabled but not
-    // initialized" << std::endl ;
-  }
-}
-
-void ThreadsSpaceInitializer::finalize(const bool all_spaces) {
-  if (std::is_same<Kokkos::Threads, Kokkos::DefaultExecutionSpace>::value ||
-      std::is_same<Kokkos::Threads,
-                   Kokkos::HostSpace::execution_space>::value ||
-      all_spaces) {
-    if (Kokkos::Threads::impl_is_initialized())
-      Kokkos::Threads::impl_finalize();
-  }
-}
-
-void ThreadsSpaceInitializer::fence() { Kokkos::Threads::impl_static_fence(); }
-void ThreadsSpaceInitializer::fence(const std::string &name) {
-  Kokkos::Threads::impl_static_fence(name);
-}
-
-void ThreadsSpaceInitializer::print_configuration(std::ostream &msg,
-                                                  const bool detail) {
-  msg << "Host Parallel Execution Space:" << std::endl;
-  msg << "  KOKKOS_ENABLE_THREADS: ";
-  msg << "yes" << std::endl;
-
-  msg << "\nThreads Runtime Configuration:" << std::endl;
-  Kokkos::Threads::print_configuration(msg, detail);
-}
+    initialize_space_factory<Threads>("050_Threads");
 
 }  // namespace Impl
 
@@ -896,8 +900,3 @@ constexpr DeviceType DeviceTypeTraits<Threads>::id;
 #endif
 
 } /* namespace Kokkos */
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-#else
-void KOKKOS_CORE_SRC_THREADS_EXEC_PREVENT_LINK_ERROR() {}
-#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index d17f417bb..238a76554 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -46,17 +46,14 @@
 #define KOKKOS_THREADSEXEC_HPP
 
 #include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_THREADS)
 
 #include <cstdio>
 
 #include <utility>
 #include <impl/Kokkos_Spinwait.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 
 #include <Kokkos_Atomic.hpp>
 
-#include <Kokkos_UniqueToken.hpp>
 #include <impl/Kokkos_ConcurrentBitset.hpp>
 
 //----------------------------------------------------------------------------
@@ -99,7 +96,7 @@ class ThreadsExec {
 
   void *m_scratch;
   int m_scratch_reduce_end;
-  int m_scratch_thread_end;
+  size_t m_scratch_thread_end;
   int m_numa_rank;
   int m_numa_core_rank;
   int m_pool_rank;
@@ -167,9 +164,7 @@ class ThreadsExec {
 
   static int is_initialized();
 
-  static void initialize(unsigned thread_count, unsigned use_numa_count,
-                         unsigned use_cores_per_numa,
-                         bool allow_asynchronous_threadpool);
+  static void initialize(int thread_count);
 
   static void finalize();
 
@@ -189,7 +184,7 @@ class ThreadsExec {
     // Make sure there is enough scratch space:
     const int rev_rank = m_pool_size - (m_pool_rank + 1);
 
-    *((volatile int *)reduce_memory()) = value;
+    *static_cast<volatile int *>(reduce_memory()) = value;
 
     memory_fence();
 
@@ -210,11 +205,12 @@ class ThreadsExec {
       int accum = 0;
 
       for (int rank = 0; rank < m_pool_size; ++rank) {
-        accum += *((volatile int *)get_thread(rank)->reduce_memory());
+        accum +=
+            *static_cast<volatile int *>(get_thread(rank)->reduce_memory());
       }
 
       for (int rank = 0; rank < m_pool_size; ++rank) {
-        *((volatile int *)get_thread(rank)->reduce_memory()) = accum;
+        *static_cast<volatile int *>(get_thread(rank)->reduce_memory()) = accum;
       }
 
       memory_fence();
@@ -224,7 +220,7 @@ class ThreadsExec {
       }
     }
 
-    return *((volatile int *)reduce_memory());
+    return *static_cast<volatile int *>(reduce_memory());
   }
 
   inline void barrier() {
@@ -258,11 +254,8 @@ class ThreadsExec {
   //------------------------------------
   // All-thread functions:
 
-  template <class FunctorType, class ArgTag>
+  template <class FunctorType>
   inline void fan_in_reduce(const FunctorType &f) const {
-    using Join  = Kokkos::Impl::FunctorValueJoin<FunctorType, ArgTag>;
-    using Final = Kokkos::Impl::FunctorFinal<FunctorType, ArgTag>;
-
     const int rev_rank = m_pool_size - (m_pool_rank + 1);
 
     for (int i = 0; i < m_pool_fan_size; ++i) {
@@ -270,11 +263,15 @@ class ThreadsExec {
 
       Impl::spinwait_while_equal<int>(fan.m_pool_state, ThreadsExec::Active);
 
-      Join::join(f, reduce_memory(), fan.reduce_memory());
+      f.join(
+          reinterpret_cast<typename FunctorType::value_type *>(reduce_memory()),
+          reinterpret_cast<const typename FunctorType::value_type *>(
+              fan.reduce_memory()));
     }
 
     if (!rev_rank) {
-      Final::final(f, reduce_memory());
+      f.final(reinterpret_cast<typename FunctorType::value_type *>(
+          reduce_memory()));
     }
 
     //  This thread has updated 'reduce_memory()' and upon returning
@@ -298,7 +295,7 @@ class ThreadsExec {
     }
   }
 
-  template <class FunctorType, class ArgTag>
+  template <class FunctorType>
   inline void scan_large(const FunctorType &f) {
     // Sequence of states:
     //  0) Active             : entry and exit state
@@ -307,14 +304,10 @@ class ThreadsExec {
     //  3) Rendezvous         : All threads inclusive scan value are available
     //  4) ScanCompleted      : exclusive scan value copied
 
-    using Traits = Kokkos::Impl::FunctorValueTraits<FunctorType, ArgTag>;
-    using Join   = Kokkos::Impl::FunctorValueJoin<FunctorType, ArgTag>;
-    using Init   = Kokkos::Impl::FunctorValueInit<FunctorType, ArgTag>;
-
-    using scalar_type = typename Traits::value_type;
+    using scalar_type = typename FunctorType::value_type;
 
     const int rev_rank   = m_pool_size - (m_pool_rank + 1);
-    const unsigned count = Traits::value_count(f);
+    const unsigned count = FunctorType::value_count(f);
 
     scalar_type *const work_value = (scalar_type *)reduce_memory();
 
@@ -325,7 +318,7 @@ class ThreadsExec {
 
       // Wait: Active -> ReductionAvailable (or ScanAvailable)
       Impl::spinwait_while_equal<int>(fan.m_pool_state, ThreadsExec::Active);
-      Join::join(f, work_value, fan.reduce_memory());
+      f.join(work_value, fan.reduce_memory());
     }
 
     // Copy reduction value to scan value before releasing from this phase.
@@ -347,8 +340,7 @@ class ThreadsExec {
         Impl::spinwait_while_equal<int>(th.m_pool_state,
                                         ThreadsExec::ReductionAvailable);
 
-        Join::join(f, work_value + count,
-                   ((scalar_type *)th.reduce_memory()) + count);
+        f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count);
       }
 
       // This thread has completed inclusive scan
@@ -388,7 +380,7 @@ class ThreadsExec {
         work_value[j] = src_value[j];
       }
     } else {
-      (void)Init::init(f, work_value);
+      f.init(work_value);
     }
 
     //--------------------------------
@@ -411,16 +403,12 @@ class ThreadsExec {
     }
   }
 
-  template <class FunctorType, class ArgTag>
+  template <class FunctorType>
   inline void scan_small(const FunctorType &f) {
-    using Traits = Kokkos::Impl::FunctorValueTraits<FunctorType, ArgTag>;
-    using Join   = Kokkos::Impl::FunctorValueJoin<FunctorType, ArgTag>;
-    using Init   = Kokkos::Impl::FunctorValueInit<FunctorType, ArgTag>;
-
-    using scalar_type = typename Traits::value_type;
+    using scalar_type = typename FunctorType::value_type;
 
     const int rev_rank   = m_pool_size - (m_pool_rank + 1);
-    const unsigned count = Traits::value_count(f);
+    const unsigned count = f.length();
 
     scalar_type *const work_value = (scalar_type *)reduce_memory();
 
@@ -452,9 +440,9 @@ class ThreadsExec {
           for (unsigned i = 0; i < count; ++i) {
             ptr[i] = ptr_prev[i + count];
           }
-          Join::join(f, ptr + count, ptr);
+          f.join(ptr + count, ptr);
         } else {
-          (void)Init::init(f, ptr);
+          f.init(ptr);
         }
         ptr_prev = ptr;
       }
@@ -626,139 +614,24 @@ inline int Threads::impl_is_initialized() {
   return Impl::ThreadsExec::is_initialized();
 }
 
-inline void Threads::impl_initialize(unsigned threads_count,
-                                     unsigned use_numa_count,
-                                     unsigned use_cores_per_numa,
-                                     bool allow_asynchronous_threadpool) {
-  Impl::ThreadsExec::initialize(threads_count, use_numa_count,
-                                use_cores_per_numa,
-                                allow_asynchronous_threadpool);
+inline void Threads::impl_initialize(InitializationSettings const &settings) {
+  Impl::ThreadsExec::initialize(
+      settings.has_num_threads() ? settings.get_num_threads() : -1);
 }
 
 inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); }
 
-inline void Threads::print_configuration(std::ostream &s, const bool detail) {
-  Impl::ThreadsExec::print_configuration(s, detail);
-}
+inline void Threads::print_configuration(std::ostream &os, bool verbose) const {
+  os << "Host Parallel Execution Space:\n";
+  os << "  KOKKOS_ENABLE_THREADS: yes\n";
 
-inline void Threads::impl_static_fence() {
-  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::yes);
+  os << "\nThreads Runtime Configuration:\n";
+  Impl::ThreadsExec::print_configuration(os, verbose);
 }
+
 inline void Threads::impl_static_fence(const std::string &name) {
   Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes);
 }
 } /* namespace Kokkos */
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-template <>
-class UniqueToken<Threads, UniqueTokenScope::Instance> {
- private:
-  using buffer_type = Kokkos::View<uint32_t *, Kokkos::HostSpace>;
-  int m_count;
-  buffer_type m_buffer_view;
-  uint32_t volatile *m_buffer;
-
- public:
-  using execution_space = Threads;
-  using size_type       = int;
-
-  /// \brief create object size for concurrency on the given instance
-  ///
-  /// This object should not be shared between instances
-  UniqueToken(execution_space const & = execution_space()) noexcept
-      : m_count(::Kokkos::Threads::impl_thread_pool_size()),
-        m_buffer_view(buffer_type()),
-        m_buffer(nullptr) {}
-
-  UniqueToken(size_type max_size, execution_space const & = execution_space())
-      : m_count(max_size > ::Kokkos::Threads::impl_thread_pool_size()
-                    ? ::Kokkos::Threads::impl_thread_pool_size()
-                    : max_size),
-        m_buffer_view(
-            max_size > ::Kokkos::Threads::impl_thread_pool_size()
-                ? buffer_type()
-                : buffer_type("UniqueToken::m_buffer_view",
-                              ::Kokkos::Impl::concurrent_bitset::buffer_bound(
-                                  m_count))),
-        m_buffer(m_buffer_view.data()) {}
-
-  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int size() const noexcept { return m_count; }
-
-  /// \brief acquire value such that 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int acquire() const noexcept {
-    KOKKOS_IF_ON_HOST((
-        if (m_buffer == nullptr) {
-          return Threads::impl_thread_pool_rank();
-        } else {
-          const ::Kokkos::pair<int, int> result =
-              ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
-                  m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);
-
-          if (result.first < 0) {
-            ::Kokkos::abort(
-                "UniqueToken<Threads> failure to acquire tokens, no tokens "
-                "available");
-          }
-          return result.first;
-        }))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief release a value acquired by generate
-  KOKKOS_INLINE_FUNCTION
-  void release(int i) const noexcept {
-    KOKKOS_IF_ON_HOST((if (m_buffer != nullptr) {
-      ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
-    }))
-
-    KOKKOS_IF_ON_DEVICE(((void)i;))
-  }
-};
-
-template <>
-class UniqueToken<Threads, UniqueTokenScope::Global> {
- public:
-  using execution_space = Threads;
-  using size_type       = int;
-
-  /// \brief create object size for concurrency on the given instance
-  ///
-  /// This object should not be shared between instances
-  UniqueToken(execution_space const & = execution_space()) noexcept {}
-
-  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int size() const noexcept {
-    KOKKOS_IF_ON_HOST((return Threads::impl_thread_pool_size();))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief acquire value such that 0 <= value < size()
-  KOKKOS_INLINE_FUNCTION
-  int acquire() const noexcept {
-    KOKKOS_IF_ON_HOST((return Threads::impl_thread_pool_rank();))
-
-    KOKKOS_IF_ON_DEVICE((return 0;))
-  }
-
-  /// \brief release a value acquired by generate
-  KOKKOS_INLINE_FUNCTION
-  void release(int) const noexcept {}
-};
-
-}  // namespace Experimental
-}  // namespace Kokkos
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-#endif
 #endif /* #define KOKKOS_THREADSEXEC_HPP */
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
index 36d6a25b0..02ce93250 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -46,13 +46,11 @@
 #define KOKKOS_THREADSTEAM_HPP
 
 #include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_THREADS)
 
 #include <cstdio>
 
 #include <utility>
 #include <impl/Kokkos_Spinwait.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
 
 #include <Kokkos_Atomic.hpp>
@@ -82,7 +80,7 @@ class ThreadsExecTeamMember {
   ThreadsExec* const m_exec;
   ThreadsExec* const* m_team_base;  ///< Base for team fan-in
   space m_team_shared;
-  int m_team_shared_size;
+  size_t m_team_shared_size;
   int m_team_size;
   int m_team_rank;
   int m_team_rank_rev;
@@ -97,9 +95,9 @@ class ThreadsExecTeamMember {
   int m_team_alloc;
 
   inline void set_team_shared() {
-    new (&m_team_shared)
-        space(((char*)(*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE,
-              m_team_shared_size);
+    new (&m_team_shared) space(
+        static_cast<char*>((*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE,
+        m_team_shared_size);
   }
 
  public:
@@ -208,7 +206,7 @@ class ThreadsExecTeamMember {
 
   template <typename Type>
   KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<!Kokkos::is_reducer<Type>::value, Type>::type
+      std::enable_if_t<!Kokkos::is_reducer<Type>::value, Type>
       team_reduce(const Type& value) const {
     KOKKOS_IF_ON_DEVICE((return value;))
 
@@ -240,61 +238,60 @@ class ThreadsExecTeamMember {
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer) const noexcept {
     team_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
   KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+      std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
       team_reduce(const ReducerType& reducer,
                   const typename ReducerType::value_type contribution) const {
     KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution;))
 
-    KOKKOS_IF_ON_HOST(
-        (using value_type = typename ReducerType::value_type;
-         // Make sure there is enough scratch space:
-         using type = typename if_c<sizeof(value_type) < TEAM_REDUCE_SIZE,
-                                    value_type, void>::type;
+    KOKKOS_IF_ON_HOST((
+        using value_type = typename ReducerType::value_type;
+        // Make sure there is enough scratch space:
+        using type = typename if_c<sizeof(value_type) < TEAM_REDUCE_SIZE,
+                                   value_type, void>::type;
 
-         if (nullptr == m_exec) return;
+        if (nullptr == m_exec) return;
 
-         type* const local_value = ((type*)m_exec->scratch_memory());
+        type* const local_value = ((type*)m_exec->scratch_memory());
 
-         // Set this thread's contribution
-         if (team_rank() != team_size() - 1)* local_value = contribution;
+        // Set this thread's contribution
+        if (team_rank() != team_size() - 1) { *local_value = contribution; }
 
-         // Fence to make sure the base team member has access:
-         memory_fence();
+        // Fence to make sure the base team member has access:
+        memory_fence();
 
-         if (team_fan_in()) {
-           // The last thread to synchronize returns true, all other threads
-           // wait for team_fan_out()
-           type* const team_value = ((type*)m_team_base[0]->scratch_memory());
+        if (team_fan_in()) {
+          // The last thread to synchronize returns true, all other threads
+          // wait for team_fan_out()
+          type* const team_value = ((type*)m_team_base[0]->scratch_memory());
 
-           *team_value = contribution;
-           // Join to the team value:
-           for (int i = 1; i < m_team_size; ++i) {
-             reducer.join(*team_value,
-                          *((type*)m_team_base[i]->scratch_memory()));
-           }
+          *team_value = contribution;
+          // Join to the team value:
+          for (int i = 1; i < m_team_size; ++i) {
+            reducer.join(*team_value,
+                         *((type*)m_team_base[i]->scratch_memory()));
+          }
 
-           // Team base thread may "lap" member threads so copy out to their
-           // local value.
-           for (int i = 1; i < m_team_size; ++i) {
-             *((type*)m_team_base[i]->scratch_memory()) = *team_value;
-           }
+          // Team base thread may "lap" member threads so copy out to their
+          // local value.
+          for (int i = 1; i < m_team_size; ++i) {
+            *((type*)m_team_base[i]->scratch_memory()) = *team_value;
+          }
 
-           // Fence to make sure all team members have access
-           memory_fence();
-         }
+          // Fence to make sure all team members have access
+          memory_fence();
+        }
 
-         team_fan_out();
+        team_fan_out();
 
-         // Value was changed by the team base
-         reducer.reference() = *((type volatile const*)local_value);))
+        // Value was changed by the team base
+        reducer.reference() = *local_value;))
   }
 
   /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
@@ -374,7 +371,7 @@ class ThreadsExecTeamMember {
   ThreadsExecTeamMember(
       Impl::ThreadsExec* exec,
       const TeamPolicyInternal<Kokkos::Threads, Properties...>& team,
-      const int shared_size)
+      const size_t shared_size)
       : m_exec(exec),
         m_team_base(nullptr),
         m_team_shared(nullptr, 0),
@@ -415,7 +412,7 @@ class ThreadsExecTeamMember {
       if (league_iter_end > team.league_size())
         league_iter_end = team.league_size();
 
-      if ((team.team_alloc() > m_team_size)
+      if ((team.team_alloc() > size_t(m_team_size))
               ? (team_rank_rev >= m_team_size)
               : (m_exec->pool_size() - pool_num_teams * m_team_size >
                  m_exec->pool_rank()))
@@ -525,7 +522,7 @@ class ThreadsExecTeamMember {
   }
 
   void set_league_shmem(const int arg_league_rank, const int arg_league_size,
-                        const int arg_shmem_size) {
+                        const size_t arg_shmem_size) {
     m_league_rank      = arg_league_rank;
     m_league_size      = arg_league_size;
     m_team_shared_size = arg_shmem_size;
@@ -666,7 +663,7 @@ class TeamPolicyInternal<Kokkos::Threads, Properties...>
 
   inline int team_size() const { return m_team_size; }
   inline int impl_vector_length() const { return 1; }
-  inline int team_alloc() const { return m_team_alloc; }
+  inline size_t team_alloc() const { return m_team_alloc; }
   inline int league_size() const { return m_league_size; }
 
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
@@ -828,11 +825,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type,
-    Impl::ThreadsExecTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::ThreadsExecTeamMember>
 TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, const iType1& begin,
                 const iType2& end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<iType,
                                                Impl::ThreadsExecTeamMember>(
       thread, iType(begin), iType(end));
@@ -850,11 +846,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type,
-    Impl::ThreadsExecTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::ThreadsExecTeamMember>
 TeamVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType1& begin,
                 const iType2& end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::TeamThreadRangeBoundariesStruct<iType,
                                                Impl::ThreadsExecTeamMember>(
       thread, iType(begin), iType(end));
@@ -872,11 +867,10 @@ KOKKOS_INLINE_FUNCTION
 
 template <typename iType1, typename iType2>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type,
-    Impl::ThreadsExecTeamMember>
+    std::common_type_t<iType1, iType2>, Impl::ThreadsExecTeamMember>
 ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread,
                   const iType1& arg_begin, const iType2& arg_end) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::ThreadVectorRangeBoundariesStruct<iType,
                                                  Impl::ThreadsExecTeamMember>(
       thread, iType(arg_begin), iType(arg_end));
@@ -919,11 +913,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  * and a summation of val is performed and put into result.
  */
 template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
-                    const Lambda& lambda, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
   ValueType intermediate;
   Sum<ValueType> sum(intermediate);
   sum.init(intermediate);
@@ -940,11 +933,10 @@ KOKKOS_INLINE_FUNCTION
 }
 
 template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
-                    const Lambda& lambda, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, const ReducerType& reducer) {
   typename ReducerType::value_type value;
   reducer.init(value);
 
@@ -984,11 +976,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
  * and a summation of val is performed and put into result.
  */
 template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
-    parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
-                    const Lambda& lambda, ValueType& result) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
   result = ValueType();
   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
        i += loop_boundaries.increment) {
@@ -997,11 +988,10 @@ KOKKOS_INLINE_FUNCTION
 }
 
 template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
-                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
-                    const Lambda& lambda, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, const ReducerType& reducer) {
   reducer.init(reducer.reference());
   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
        i += loop_boundaries.increment) {
@@ -1062,8 +1052,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
     const Impl::ThreadVectorRangeBoundariesStruct<
         iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
     const FunctorType& lambda) {
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
-  using value_type  = typename ValueTraits::value_type;
+  using value_type =
+      typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                     TeamPolicy<Threads>,
+                                     FunctorType>::value_type;
 
   value_type scan_val = value_type();
 
@@ -1080,11 +1072,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
  *
  */
 template <typename iType, class FunctorType, typename ReducerType>
-KOKKOS_INLINE_FUNCTION
-    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
-    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
-                      iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
-                  const FunctorType& lambda, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
+parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
+                  iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+              const FunctorType& lambda, const ReducerType& reducer) {
   typename ReducerType::value_type scan_val;
   reducer.init(scan_val);
 
@@ -1137,5 +1128,4 @@ KOKKOS_INLINE_FUNCTION void single(
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-#endif
 #endif /* #define KOKKOS_THREADSTEAM_HPP */
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
deleted file mode 100644
index 88dc670fa..000000000
--- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
+++ /dev/null
@@ -1,1034 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_THREADS_PARALLEL_HPP
-#define KOKKOS_THREADS_PARALLEL_HPP
-
-#include <Kokkos_Macros.hpp>
-#if defined(KOKKOS_ENABLE_THREADS)
-
-#include <Kokkos_Parallel.hpp>
-
-#include <impl/Kokkos_FunctorAdapter.hpp>
-
-#include <KokkosExp_MDRangePolicy.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-/* ParallelFor Kokkos::Threads with RangePolicy */
-
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
-                  Kokkos::Threads> {
- private:
-  using Policy    = Kokkos::RangePolicy<Traits...>;
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member ibeg,
-                 const Member iend) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i);
-    }
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member ibeg,
-                 const Member iend) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Static>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    ParallelFor::template exec_range<WorkTag>(self.m_functor, range.begin(),
-                                              range.end());
-
-    exec.fan_in();
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Dynamic>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    exec.set_work_range(range.begin() - self.m_policy.begin(),
-                        range.end() - self.m_policy.begin(),
-                        self.m_policy.chunk_size());
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index = exec.get_work_index();
-
-    while (work_index != -1) {
-      const Member begin =
-          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
-          self.m_policy.begin();
-      const Member end =
-          begin + self.m_policy.chunk_size() < self.m_policy.end()
-              ? begin + self.m_policy.chunk_size()
-              : self.m_policy.end();
-      ParallelFor::template exec_range<WorkTag>(self.m_functor, begin, end);
-      work_index = exec.get_work_index();
-    }
-
-    exec.fan_in();
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::start(&ParallelFor::exec, this);
-    ThreadsExec::fence();
-  }
-
-  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-// MDRangePolicy impl
-template <class FunctorType, class... Traits>
-class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
-                  Kokkos::Threads> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-
-  using WorkTag = typename MDRangePolicy::work_tag;
-
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using iterate_type = typename Kokkos::Impl::HostIterateTile<
-      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
-
-  const FunctorType m_functor;
-  const MDRangePolicy m_mdr_policy;
-  const Policy m_policy;  // construct as RangePolicy( 0, num_tiles
-                          // ).set_chunk_size(1) in ctor
-
-  inline static void exec_range(const MDRangePolicy &mdr_policy,
-                                const FunctorType &functor, const Member ibeg,
-                                const Member iend) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      iterate_type(mdr_policy, functor)(i);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Static>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, range.begin(),
-                            range.end());
-
-    exec.fan_in();
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Dynamic>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size());
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index = exec.get_work_index();
-
-    while (work_index != -1) {
-      const Member begin =
-          static_cast<Member>(work_index) * self.m_policy.chunk_size();
-      const Member end =
-          begin + self.m_policy.chunk_size() < self.m_policy.end()
-              ? begin + self.m_policy.chunk_size()
-              : self.m_policy.end();
-
-      ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, begin, end);
-      work_index = exec.get_work_index();
-    }
-
-    exec.fan_in();
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::start(&ParallelFor::exec, this);
-    ThreadsExec::fence();
-  }
-
-  ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy)
-      : m_functor(arg_functor),
-        m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
-
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy &, const Functor &) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-
-//----------------------------------------------------------------------------
-/* ParallelFor Kokkos::Threads with TeamPolicy */
-
-template <class FunctorType, class... Properties>
-class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                  Kokkos::Threads> {
- private:
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
-  using WorkTag = typename Policy::work_tag;
-  using Member  = typename Policy::member_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const int m_shared;
-
-  template <class TagType, class Schedule>
-  inline static typename std::enable_if<
-      std::is_same<TagType, void>::value &&
-      std::is_same<Schedule, Kokkos::Static>::value>::type
-  exec_team(const FunctorType &functor, Member member) {
-    for (; member.valid_static(); member.next_static()) {
-      functor(member);
-    }
-  }
-
-  template <class TagType, class Schedule>
-  inline static typename std::enable_if<
-      !std::is_same<TagType, void>::value &&
-      std::is_same<Schedule, Kokkos::Static>::value>::type
-  exec_team(const FunctorType &functor, Member member) {
-    const TagType t{};
-    for (; member.valid_static(); member.next_static()) {
-      functor(t, member);
-    }
-  }
-
-  template <class TagType, class Schedule>
-  inline static typename std::enable_if<
-      std::is_same<TagType, void>::value &&
-      std::is_same<Schedule, Kokkos::Dynamic>::value>::type
-  exec_team(const FunctorType &functor, Member member) {
-    for (; member.valid_dynamic(); member.next_dynamic()) {
-      functor(member);
-    }
-  }
-
-  template <class TagType, class Schedule>
-  inline static typename std::enable_if<
-      !std::is_same<TagType, void>::value &&
-      std::is_same<Schedule, Kokkos::Dynamic>::value>::type
-  exec_team(const FunctorType &functor, Member member) {
-    const TagType t{};
-    for (; member.valid_dynamic(); member.next_dynamic()) {
-      functor(t, member);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelFor &self = *((const ParallelFor *)arg);
-
-    ParallelFor::exec_team<WorkTag, typename Policy::schedule_type::type>(
-        self.m_functor, Member(&exec, self.m_policy, self.m_shared));
-
-    exec.barrier();
-    exec.fan_in();
-  }
-  template <typename Policy>
-  Policy fix_policy(Policy policy) {
-    if (policy.impl_vector_length() < 0) {
-      policy.impl_set_vector_length(1);
-    }
-    if (policy.team_size() < 0) {
-      policy.impl_set_team_size(
-          policy.team_size_recommended(m_functor, ParallelForTag{}));
-    }
-    return policy;
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(
-        0, Policy::member_type::team_reduce_size() + m_shared);
-
-    ThreadsExec::start(&ParallelFor::exec, this);
-
-    ThreadsExec::fence();
-  }
-
-  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor),
-        m_policy(fix_policy(arg_policy)),
-        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, m_policy.team_size())) {}
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-/* ParallelReduce with Kokkos::Threads and RangePolicy */
-
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
-                     Kokkos::Threads> {
- private:
-  using Policy = Kokkos::RangePolicy<Traits...>;
-
-  using WorkTag   = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit  = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member &ibeg,
-                 const Member &iend, reference_type update) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i, update);
-    }
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member &ibeg,
-                 const Member &iend, reference_type update) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i, update);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Static>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    ParallelReduce::template exec_range<WorkTag>(
-        self.m_functor, range.begin(), range.end(),
-        ValueInit::init(
-            ReducerConditional::select(self.m_functor, self.m_reducer),
-            exec.reduce_memory()));
-
-    exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>(
-        ReducerConditional::select(self.m_functor, self.m_reducer));
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Dynamic>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    exec.set_work_range(range.begin() - self.m_policy.begin(),
-                        range.end() - self.m_policy.begin(),
-                        self.m_policy.chunk_size());
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index       = exec.get_work_index();
-    reference_type update = ValueInit::init(
-        ReducerConditional::select(self.m_functor, self.m_reducer),
-        exec.reduce_memory());
-    while (work_index != -1) {
-      const Member begin =
-          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
-          self.m_policy.begin();
-      const Member end =
-          begin + self.m_policy.chunk_size() < self.m_policy.end()
-              ? begin + self.m_policy.chunk_size()
-              : self.m_policy.end();
-      ParallelReduce::template exec_range<WorkTag>(self.m_functor, begin, end,
-                                                   update);
-      work_index = exec.get_work_index();
-    }
-
-    exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>(
-        ReducerConditional::select(self.m_functor, self.m_reducer));
-  }
-
- public:
-  inline void execute() const {
-    if (m_policy.end() <= m_policy.begin()) {
-      if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-        ValueFinal::final(ReducerConditional::select(m_functor, m_reducer),
-                          m_result_ptr);
-      }
-    } else {
-      ThreadsExec::resize_scratch(
-          ValueTraits::value_size(
-              ReducerConditional::select(m_functor, m_reducer)),
-          0);
-
-      ThreadsExec::start(&ParallelReduce::exec, this);
-
-      ThreadsExec::fence();
-
-      if (m_result_ptr) {
-        const pointer_type data =
-            (pointer_type)ThreadsExec::root_reduce_scratch();
-
-        const unsigned n = ValueTraits::value_count(
-            ReducerConditional::select(m_functor, m_reducer));
-        for (unsigned i = 0; i < n; ++i) {
-          m_result_ptr[i] = data[i];
-        }
-      }
-    }
-  }
-
-  template <class HostViewType>
-  ParallelReduce(
-      const FunctorType &arg_functor, const Policy &arg_policy,
-      const HostViewType &arg_result_view,
-      typename std::enable_if<Kokkos::is_view<HostViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void *>::type = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
-                  "Kokkos::Threads reduce result must be a View");
-
-    static_assert(
-        std::is_same<typename HostViewType::memory_space, HostSpace>::value,
-        "Kokkos::Threads reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
-                        const ReducerType &reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-};
-
-// MDRangePolicy impl
-template <class FunctorType, class ReducerType, class... Traits>
-class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
-                     Kokkos::Threads> {
- private:
-  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
-  using Policy        = typename MDRangePolicy::impl_range_policy;
-
-  using WorkTag   = typename MDRangePolicy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using value_type     = typename ValueTraits::value_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  using iterate_type =
-      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
-                                             WorkTag, reference_type>;
-
-  const FunctorType m_functor;
-  const MDRangePolicy m_mdr_policy;
-  const Policy m_policy;  // construct as RangePolicy( 0, num_tiles
-                          // ).set_chunk_size(1) in ctor
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-
-  inline static void exec_range(const MDRangePolicy &mdr_policy,
-                                const FunctorType &functor, const Member &ibeg,
-                                const Member &iend, reference_type update) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      iterate_type(mdr_policy, functor, update)(i);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Static>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    ParallelReduce::exec_range(
-        self.m_mdr_policy, self.m_functor, range.begin(), range.end(),
-        ValueInit::init(
-            ReducerConditional::select(self.m_functor, self.m_reducer),
-            exec.reduce_memory()));
-
-    exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>(
-        ReducerConditional::select(self.m_functor, self.m_reducer));
-  }
-
-  template <class Schedule>
-  static typename std::enable_if<
-      std::is_same<Schedule, Kokkos::Dynamic>::value>::type
-  exec_schedule(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size());
-    exec.reset_steal_target();
-    exec.barrier();
-
-    long work_index       = exec.get_work_index();
-    reference_type update = ValueInit::init(
-        ReducerConditional::select(self.m_functor, self.m_reducer),
-        exec.reduce_memory());
-    while (work_index != -1) {
-      const Member begin =
-          static_cast<Member>(work_index) * self.m_policy.chunk_size();
-      const Member end =
-          begin + self.m_policy.chunk_size() < self.m_policy.end()
-              ? begin + self.m_policy.chunk_size()
-              : self.m_policy.end();
-      ParallelReduce::exec_range(self.m_mdr_policy, self.m_functor, begin, end,
-                                 update);
-      work_index = exec.get_work_index();
-    }
-
-    exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>(
-        ReducerConditional::select(self.m_functor, self.m_reducer));
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(
-        ValueTraits::value_size(
-            ReducerConditional::select(m_functor, m_reducer)),
-        0);
-
-    ThreadsExec::start(&ParallelReduce::exec, this);
-
-    ThreadsExec::fence();
-
-    if (m_result_ptr) {
-      const pointer_type data =
-          (pointer_type)ThreadsExec::root_reduce_scratch();
-
-      const unsigned n = ValueTraits::value_count(
-          ReducerConditional::select(m_functor, m_reducer));
-      for (unsigned i = 0; i < n; ++i) {
-        m_result_ptr[i] = data[i];
-      }
-    }
-  }
-
-  template <class HostViewType>
-  ParallelReduce(
-      const FunctorType &arg_functor, const MDRangePolicy &arg_policy,
-      const HostViewType &arg_result_view,
-      typename std::enable_if<Kokkos::is_view<HostViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void *>::type = nullptr)
-      : m_functor(arg_functor),
-        m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    static_assert(Kokkos::is_view<HostViewType>::value,
-                  "Kokkos::Threads reduce result must be a View");
-
-    static_assert(
-        std::is_same<typename HostViewType::memory_space, HostSpace>::value,
-        "Kokkos::Threads reduce result must be a View in HostSpace");
-  }
-
-  inline ParallelReduce(const FunctorType &arg_functor,
-                        MDRangePolicy arg_policy, const ReducerType &reducer)
-      : m_functor(arg_functor),
-        m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                                    , Kokkos::HostSpace >::value
-      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-      );*/
-  }
-
-  template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy &, const Functor &) {
-    /**
-     * 1024 here is just our guess for a reasonable max tile size,
-     * it isn't a hardware constraint. If people see a use for larger
-     * tile size products, we're happy to change this.
-     */
-    return 1024;
-  }
-};
-
-//----------------------------------------------------------------------------
-/* ParallelReduce with Kokkos::Threads and TeamPolicy */
-
-template <class FunctorType, class ReducerType, class... Properties>
-class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
-                     ReducerType, Kokkos::Threads> {
- private:
-  using Policy =
-      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
-  using WorkTag = typename Policy::work_tag;
-  using Member  = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
-  using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type;
-
-  using ValueTraits =
-      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit  = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
-  const int m_shared;
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_team(const FunctorType &functor, Member member,
-                reference_type update) {
-    for (; member.valid_static(); member.next_static()) {
-      functor(member, update);
-    }
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_team(const FunctorType &functor, Member member,
-                reference_type update) {
-    const TagType t{};
-    for (; member.valid_static(); member.next_static()) {
-      functor(t, member, update);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelReduce &self = *((const ParallelReduce *)arg);
-
-    ParallelReduce::template exec_team<WorkTag>(
-        self.m_functor, Member(&exec, self.m_policy, self.m_shared),
-        ValueInit::init(
-            ReducerConditional::select(self.m_functor, self.m_reducer),
-            exec.reduce_memory()));
-
-    exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>(
-        ReducerConditional::select(self.m_functor, self.m_reducer));
-  }
-
- public:
-  inline void execute() const {
-    if (m_policy.league_size() * m_policy.team_size() == 0) {
-      if (m_result_ptr) {
-        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                        m_result_ptr);
-        ValueFinal::final(ReducerConditional::select(m_functor, m_reducer),
-                          m_result_ptr);
-      }
-    } else {
-      ThreadsExec::resize_scratch(
-          ValueTraits::value_size(
-              ReducerConditional::select(m_functor, m_reducer)),
-          Policy::member_type::team_reduce_size() + m_shared);
-
-      ThreadsExec::start(&ParallelReduce::exec, this);
-
-      ThreadsExec::fence();
-
-      if (m_result_ptr) {
-        const pointer_type data =
-            (pointer_type)ThreadsExec::root_reduce_scratch();
-
-        const unsigned n = ValueTraits::value_count(
-            ReducerConditional::select(m_functor, m_reducer));
-        for (unsigned i = 0; i < n; ++i) {
-          m_result_ptr[i] = data[i];
-        }
-      }
-    }
-  }
-
-  template <typename Policy>
-  Policy fix_policy(Policy policy) {
-    if (policy.impl_vector_length() < 0) {
-      policy.impl_set_vector_length(1);
-    }
-    if (policy.team_size() < 0) {
-      policy.impl_set_team_size(policy.team_size_recommended(
-          m_functor, m_reducer, ParallelReduceTag{}));
-    }
-    return policy;
-  }
-
-  template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType &arg_functor, const Policy &arg_policy,
-      const ViewType &arg_result,
-      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
-                                  !Kokkos::is_reducer_type<ReducerType>::value,
-                              void *>::type = nullptr)
-      : m_functor(arg_functor),
-        m_policy(fix_policy(arg_policy)),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result.data()),
-        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, m_policy.team_size())) {}
-
-  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
-                        const ReducerType &reducer)
-      : m_functor(arg_functor),
-        m_policy(fix_policy(arg_policy)),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
-                 FunctorTeamShmemSize<FunctorType>::value(
-                     arg_functor, m_policy.team_size())) {
-    /*static_assert( std::is_same< typename ViewType::memory_space
-                            , Kokkos::HostSpace >::value
-    , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
-    );*/
-  }
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-/* ParallelScan with Kokkos::Threads and RangePolicy */
-
-template <class FunctorType, class... Traits>
-class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
-                   Kokkos::Threads> {
- private:
-  using Policy      = Kokkos::RangePolicy<Traits...>;
-  using WorkRange   = typename Policy::WorkRange;
-  using WorkTag     = typename Policy::work_tag;
-  using Member      = typename Policy::member_type;
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
-  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member &ibeg,
-                 const Member &iend, reference_type update, const bool final) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member &ibeg,
-                 const Member &iend, reference_type update, const bool final) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i, update, final);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelScan &self = *((const ParallelScan *)arg);
-
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    reference_type update =
-        ValueInit::init(self.m_functor, exec.reduce_memory());
-
-    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
-                                               range.end(), update, false);
-
-    //  exec.template scan_large<FunctorType,WorkTag>( self.m_functor );
-    exec.template scan_small<FunctorType, WorkTag>(self.m_functor);
-
-    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
-                                               range.end(), update, true);
-
-    exec.fan_in();
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(2 * ValueTraits::value_size(m_functor), 0);
-    ThreadsExec::start(&ParallelScan::exec, this);
-    ThreadsExec::fence();
-  }
-
-  ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
-};
-
-template <class FunctorType, class ReturnType, class... Traits>
-class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
-                            ReturnType, Kokkos::Threads> {
- private:
-  using Policy      = Kokkos::RangePolicy<Traits...>;
-  using WorkRange   = typename Policy::WorkRange;
-  using WorkTag     = typename Policy::work_tag;
-  using Member      = typename Policy::member_type;
-  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
-  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
-
-  using pointer_type   = typename ValueTraits::pointer_type;
-  using reference_type = typename ValueTraits::reference_type;
-
-  const FunctorType m_functor;
-  const Policy m_policy;
-  ReturnType &m_returnvalue;
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member &ibeg,
-                 const Member &iend, reference_type update, const bool final) {
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(i, update, final);
-    }
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      exec_range(const FunctorType &functor, const Member &ibeg,
-                 const Member &iend, reference_type update, const bool final) {
-    const TagType t{};
-#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
-    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#pragma ivdep
-#endif
-    for (Member i = ibeg; i < iend; ++i) {
-      functor(t, i, update, final);
-    }
-  }
-
-  static void exec(ThreadsExec &exec, const void *arg) {
-    const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg);
-
-    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
-
-    reference_type update =
-        ValueInit::init(self.m_functor, exec.reduce_memory());
-
-    ParallelScanWithTotal::template exec_range<WorkTag>(
-        self.m_functor, range.begin(), range.end(), update, false);
-
-    //  exec.template scan_large<FunctorType,WorkTag>( self.m_functor );
-    exec.template scan_small<FunctorType, WorkTag>(self.m_functor);
-
-    ParallelScanWithTotal::template exec_range<WorkTag>(
-        self.m_functor, range.begin(), range.end(), update, true);
-
-    exec.fan_in();
-
-    if (exec.pool_rank() == exec.pool_size() - 1) {
-      self.m_returnvalue = update;
-    }
-  }
-
- public:
-  inline void execute() const {
-    ThreadsExec::resize_scratch(2 * ValueTraits::value_size(m_functor), 0);
-    ThreadsExec::start(&ParallelScanWithTotal::exec, this);
-    ThreadsExec::fence();
-  }
-
-  ParallelScanWithTotal(const FunctorType &arg_functor,
-                        const Policy &arg_policy, ReturnType &arg_returnvalue)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_returnvalue(arg_returnvalue) {}
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
-#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp
new file mode 100644
index 000000000..6d1a38da1
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_MDRange.hpp
@@ -0,0 +1,322 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_PARALLEL_MDRANGE_HPP
+#define KOKKOS_THREADS_PARALLEL_MDRANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
+                  Kokkos::Threads> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+
+  using WorkTag = typename MDRangePolicy::work_tag;
+
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile<
+      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;
+
+  const FunctorType m_functor;
+  const MDRangePolicy m_mdr_policy;
+  const Policy m_policy;  // construct as RangePolicy( 0, num_tiles
+                          // ).set_chunk_size(1) in ctor
+
+  inline static void exec_range(const MDRangePolicy &mdr_policy,
+                                const FunctorType &functor, const Member ibeg,
+                                const Member iend) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      iterate_type(mdr_policy, functor)(i);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, range.begin(),
+                            range.end());
+
+    exec.fan_in();
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+
+    while (work_index != -1) {
+      const Member begin =
+          static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end =
+          begin + self.m_policy.chunk_size() < self.m_policy.end()
+              ? begin + self.m_policy.chunk_size()
+              : self.m_policy.end();
+
+      ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, begin, end);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in();
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::start(&ParallelFor::exec, this);
+    ThreadsExec::fence();
+  }
+
+  ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy)
+      : m_functor(arg_functor),
+        m_mdr_policy(arg_policy),
+        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
+                     Kokkos::Threads> {
+ private:
+  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
+  using Policy        = typename MDRangePolicy::impl_range_policy;
+
+  using WorkTag   = typename MDRangePolicy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         MDRangePolicy, ReducerTypeFwd>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using value_type     = typename Analysis::value_type;
+  using reference_type = typename Analysis::reference_type;
+
+  using iterate_type =
+      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
+                                             WorkTag, reference_type>;
+
+  const FunctorType m_functor;
+  const MDRangePolicy m_mdr_policy;
+  const Policy m_policy;  // construct as RangePolicy( 0, num_tiles
+                          // ).set_chunk_size(1) in ctor
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+
+  inline static void exec_range(const MDRangePolicy &mdr_policy,
+                                const FunctorType &functor, const Member &ibeg,
+                                const Member &iend, reference_type update) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      iterate_type(mdr_policy, functor, update)(i);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    typename Analysis::Reducer reducer(
+        &ReducerConditional::select(self.m_functor, self.m_reducer));
+
+    ParallelReduce::exec_range(
+        self.m_mdr_policy, self.m_functor, range.begin(), range.end(),
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
+
+    exec.fan_in_reduce(reducer);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+    typename Analysis::Reducer reducer(
+        &ReducerConditional::select(self.m_functor, self.m_reducer));
+
+    reference_type update =
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+    while (work_index != -1) {
+      const Member begin =
+          static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end =
+          begin + self.m_policy.chunk_size() < self.m_policy.end()
+              ? begin + self.m_policy.chunk_size()
+              : self.m_policy.end();
+      ParallelReduce::exec_range(self.m_mdr_policy, self.m_functor, begin, end,
+                                 update);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in_reduce(reducer);
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::resize_scratch(
+        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)),
+        0);
+
+    ThreadsExec::start(&ParallelReduce::exec, this);
+
+    ThreadsExec::fence();
+
+    if (m_result_ptr) {
+      const pointer_type data =
+          (pointer_type)ThreadsExec::root_reduce_scratch();
+
+      const unsigned n = Analysis::value_count(
+          ReducerConditional::select(m_functor, m_reducer));
+      for (unsigned i = 0; i < n; ++i) {
+        m_result_ptr[i] = data[i];
+      }
+    }
+  }
+
+  template <class HostViewType>
+  ParallelReduce(const FunctorType &arg_functor,
+                 const MDRangePolicy &arg_policy,
+                 const HostViewType &arg_result_view,
+                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void *> = nullptr)
+      : m_functor(arg_functor),
+        m_mdr_policy(arg_policy),
+        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(Kokkos::is_view<HostViewType>::value,
+                  "Kokkos::Threads reduce result must be a View");
+
+    static_assert(
+        std::is_same<typename HostViewType::memory_space, HostSpace>::value,
+        "Kokkos::Threads reduce result must be a View in HostSpace");
+  }
+
+  inline ParallelReduce(const FunctorType &arg_functor,
+                        MDRangePolicy arg_policy, const ReducerType &reducer)
+      : m_functor(arg_functor),
+        m_mdr_policy(arg_policy),
+        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()) {
+    /*static_assert( std::is_same< typename ViewType::memory_space
+                                    , Kokkos::HostSpace >::value
+      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
+      );*/
+  }
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy &, const Functor &) {
+    /**
+     * 1024 here is just our guess for a reasonable max tile size,
+     * it isn't a hardware constraint. If people see a use for larger
+     * tile size products, we're happy to change this.
+     */
+    return 1024;
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp
new file mode 100644
index 000000000..971a0bb9c
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Range.hpp
@@ -0,0 +1,485 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_PARALLEL_RANGE_HPP
+#define KOKKOS_THREADS_PARALLEL_RANGE_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
+                  Kokkos::Threads> {
+ private:
+  using Policy    = Kokkos::RangePolicy<Traits...>;
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member ibeg, const Member iend) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member ibeg, const Member iend) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    ParallelFor::template exec_range<WorkTag>(self.m_functor, range.begin(),
+                                              range.end());
+
+    exec.fan_in();
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    exec.set_work_range(range.begin() - self.m_policy.begin(),
+                        range.end() - self.m_policy.begin(),
+                        self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+
+    while (work_index != -1) {
+      const Member begin =
+          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
+          self.m_policy.begin();
+      const Member end =
+          begin + self.m_policy.chunk_size() < self.m_policy.end()
+              ? begin + self.m_policy.chunk_size()
+              : self.m_policy.end();
+      ParallelFor::template exec_range<WorkTag>(self.m_functor, begin, end);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in();
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::start(&ParallelFor::exec, this);
+    ThreadsExec::fence();
+  }
+
+  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
+                     Kokkos::Threads> {
+ private:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         Policy, ReducerTypeFwd>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i, update);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i, update);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    typename Analysis::Reducer reducer(
+        &ReducerConditional::select(self.m_functor, self.m_reducer));
+
+    ParallelReduce::template exec_range<WorkTag>(
+        self.m_functor, range.begin(), range.end(),
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
+
+    exec.fan_in_reduce(reducer);
+  }
+
+  template <class Schedule>
+  static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_schedule(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    exec.set_work_range(range.begin() - self.m_policy.begin(),
+                        range.end() - self.m_policy.begin(),
+                        self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+    typename Analysis::Reducer reducer(
+        &ReducerConditional::select(self.m_functor, self.m_reducer));
+
+    reference_type update =
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+    while (work_index != -1) {
+      const Member begin =
+          static_cast<Member>(work_index) * self.m_policy.chunk_size() +
+          self.m_policy.begin();
+      const Member end =
+          begin + self.m_policy.chunk_size() < self.m_policy.end()
+              ? begin + self.m_policy.chunk_size()
+              : self.m_policy.end();
+      ParallelReduce::template exec_range<WorkTag>(self.m_functor, begin, end,
+                                                   update);
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in_reduce(reducer);
+  }
+
+ public:
+  inline void execute() const {
+    if (m_policy.end() <= m_policy.begin()) {
+      if (m_result_ptr) {
+        typename Analysis::Reducer final_reducer(
+            &ReducerConditional::select(m_functor, m_reducer));
+        final_reducer.init(m_result_ptr);
+        final_reducer.final(m_result_ptr);
+      }
+    } else {
+      ThreadsExec::resize_scratch(
+          Analysis::value_size(
+              ReducerConditional::select(m_functor, m_reducer)),
+          0);
+
+      ThreadsExec::start(&ParallelReduce::exec, this);
+
+      ThreadsExec::fence();
+
+      if (m_result_ptr) {
+        const pointer_type data =
+            (pointer_type)ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = Analysis::value_count(
+            ReducerConditional::select(m_functor, m_reducer));
+        for (unsigned i = 0; i < n; ++i) {
+          m_result_ptr[i] = data[i];
+        }
+      }
+    }
+  }
+
+  template <class HostViewType>
+  ParallelReduce(const FunctorType &arg_functor, const Policy &arg_policy,
+                 const HostViewType &arg_result_view,
+                 std::enable_if_t<Kokkos::is_view<HostViewType>::value &&
+                                      !Kokkos::is_reducer<ReducerType>::value,
+                                  void *> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result_view.data()) {
+    static_assert(Kokkos::is_view<HostViewType>::value,
+                  "Kokkos::Threads reduce result must be a View");
+
+    static_assert(
+        std::is_same<typename HostViewType::memory_space, HostSpace>::value,
+        "Kokkos::Threads reduce result must be a View in HostSpace");
+  }
+
+  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
+                        const ReducerType &reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()) {
+    /*static_assert( std::is_same< typename ViewType::memory_space
+                                    , Kokkos::HostSpace >::value
+      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
+      );*/
+  }
+};
+
+template <class FunctorType, class... Traits>
+class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
+                   Kokkos::Threads> {
+ private:
+  using Policy    = Kokkos::RangePolicy<Traits...>;
+  using WorkRange = typename Policy::WorkRange;
+  using WorkTag   = typename Policy::work_tag;
+  using Member    = typename Policy::member_type;
+  using Analysis  = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         Policy, FunctorType>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i, update, final);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i, update, final);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelScan &self = *((const ParallelScan *)arg);
+
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    typename Analysis::Reducer final_reducer(&self.m_functor);
+
+    reference_type update =
+        final_reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+
+    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
+                                               range.end(), update, false);
+
+    //  exec.template scan_large( final_reducer );
+    exec.scan_small(final_reducer);
+
+    ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(),
+                                               range.end(), update, true);
+
+    exec.fan_in();
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0);
+    ThreadsExec::start(&ParallelScan::exec, this);
+    ThreadsExec::fence();
+  }
+
+  ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+template <class FunctorType, class ReturnType, class... Traits>
+class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
+                            ReturnType, Kokkos::Threads> {
+ private:
+  using Policy    = Kokkos::RangePolicy<Traits...>;
+  using WorkRange = typename Policy::WorkRange;
+  using WorkTag   = typename Policy::work_tag;
+  using Member    = typename Policy::member_type;
+
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
+                                         Policy, FunctorType>;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  ReturnType &m_returnvalue;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(i, update, final);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range(
+      const FunctorType &functor, const Member &ibeg, const Member &iend,
+      reference_type update, const bool final) {
+    const TagType t{};
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
+    defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#pragma ivdep
+#endif
+    for (Member i = ibeg; i < iend; ++i) {
+      functor(t, i, update, final);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg);
+
+    const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size());
+
+    typename Analysis::Reducer final_reducer(&self.m_functor);
+
+    reference_type update =
+        final_reducer.init(static_cast<pointer_type>(exec.reduce_memory()));
+
+    ParallelScanWithTotal::template exec_range<WorkTag>(
+        self.m_functor, range.begin(), range.end(), update, false);
+
+    //  exec.template scan_large(final_reducer);
+    exec.scan_small(final_reducer);
+
+    ParallelScanWithTotal::template exec_range<WorkTag>(
+        self.m_functor, range.begin(), range.end(), update, true);
+
+    exec.fan_in();
+
+    if (exec.pool_rank() == exec.pool_size() - 1) {
+      self.m_returnvalue = update;
+    }
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0);
+    ThreadsExec::start(&ParallelScanWithTotal::exec, this);
+    ThreadsExec::fence();
+  }
+
+  ParallelScanWithTotal(const FunctorType &arg_functor,
+                        const Policy &arg_policy, ReturnType &arg_returnvalue)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_returnvalue(arg_returnvalue) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp
new file mode 100644
index 000000000..bdda110d3
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel_Team.hpp
@@ -0,0 +1,279 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_PARALLEL_TEAM_HPP
+#define KOKKOS_THREADS_PARALLEL_TEAM_HPP
+
+#include <Kokkos_Parallel.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class... Properties>
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Threads> {
+ private:
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const size_t m_shared;
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Static>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    for (; member.valid_static(); member.next_static()) {
+      functor(member);
+    }
+  }
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<!std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Static>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    const TagType t{};
+    for (; member.valid_static(); member.next_static()) {
+      functor(t, member);
+    }
+  }
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    for (; member.valid_dynamic(); member.next_dynamic()) {
+      functor(member);
+    }
+  }
+
+  template <class TagType, class Schedule>
+  inline static std::enable_if_t<!std::is_void<TagType>::value &&
+                                 std::is_same<Schedule, Kokkos::Dynamic>::value>
+  exec_team(const FunctorType &functor, Member member) {
+    const TagType t{};
+    for (; member.valid_dynamic(); member.next_dynamic()) {
+      functor(t, member);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelFor &self = *((const ParallelFor *)arg);
+
+    ParallelFor::exec_team<WorkTag, typename Policy::schedule_type::type>(
+        self.m_functor, Member(&exec, self.m_policy, self.m_shared));
+
+    exec.barrier();
+    exec.fan_in();
+  }
+  template <typename Policy>
+  Policy fix_policy(Policy policy) {
+    if (policy.impl_vector_length() < 0) {
+      policy.impl_set_vector_length(1);
+    }
+    if (policy.team_size() < 0) {
+      policy.impl_set_team_size(
+          policy.team_size_recommended(m_functor, ParallelForTag{}));
+    }
+    return policy;
+  }
+
+ public:
+  inline void execute() const {
+    ThreadsExec::resize_scratch(
+        0, Policy::member_type::team_reduce_size() + m_shared);
+
+    ThreadsExec::start(&ParallelFor::exec, this);
+
+    ThreadsExec::fence();
+  }
+
+  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
+      : m_functor(arg_functor),
+        m_policy(fix_policy(arg_policy)),
+        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     arg_functor, m_policy.team_size())) {}
+};
+
+template <class FunctorType, class ReducerType, class... Properties>
+class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                     ReducerType, Kokkos::Threads> {
+ private:
+  using Policy =
+      Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>;
+  using WorkTag = typename Policy::work_tag;
+  using Member  = typename Policy::member_type;
+
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using Analysis = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                                         Policy, ReducerTypeFwd>;
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const size_t m_shared;
+
+  template <class TagType>
+  inline static std::enable_if_t<std::is_void<TagType>::value> exec_team(
+      const FunctorType &functor, Member member, reference_type update) {
+    for (; member.valid_static(); member.next_static()) {
+      functor(member, update);
+    }
+  }
+
+  template <class TagType>
+  inline static std::enable_if_t<!std::is_void<TagType>::value> exec_team(
+      const FunctorType &functor, Member member, reference_type update) {
+    const TagType t{};
+    for (; member.valid_static(); member.next_static()) {
+      functor(t, member, update);
+    }
+  }
+
+  static void exec(ThreadsExec &exec, const void *arg) {
+    const ParallelReduce &self = *((const ParallelReduce *)arg);
+
+    typename Analysis::Reducer reducer(
+        &ReducerConditional::select(self.m_functor, self.m_reducer));
+
+    ParallelReduce::template exec_team<WorkTag>(
+        self.m_functor, Member(&exec, self.m_policy, self.m_shared),
+        reducer.init(static_cast<pointer_type>(exec.reduce_memory())));
+
+    exec.fan_in_reduce(reducer);
+  }
+
+ public:
+  inline void execute() const {
+    if (m_policy.league_size() * m_policy.team_size() == 0) {
+      if (m_result_ptr) {
+        typename Analysis::Reducer final_reducer(
+            &ReducerConditional::select(m_functor, m_reducer));
+        final_reducer.init(m_result_ptr);
+        final_reducer.final(m_result_ptr);
+      }
+    } else {
+      ThreadsExec::resize_scratch(
+          Analysis::value_size(
+              ReducerConditional::select(m_functor, m_reducer)),
+          Policy::member_type::team_reduce_size() + m_shared);
+
+      ThreadsExec::start(&ParallelReduce::exec, this);
+
+      ThreadsExec::fence();
+
+      if (m_result_ptr) {
+        const pointer_type data =
+            (pointer_type)ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = Analysis::value_count(
+            ReducerConditional::select(m_functor, m_reducer));
+        for (unsigned i = 0; i < n; ++i) {
+          m_result_ptr[i] = data[i];
+        }
+      }
+    }
+  }
+
+  template <typename Policy>
+  Policy fix_policy(Policy policy) {
+    if (policy.impl_vector_length() < 0) {
+      policy.impl_set_vector_length(1);
+    }
+    if (policy.team_size() < 0) {
+      policy.impl_set_team_size(policy.team_size_recommended(
+          m_functor, m_reducer, ParallelReduceTag{}));
+    }
+    return policy;
+  }
+
+  template <class ViewType>
+  inline ParallelReduce(
+      const FunctorType &arg_functor, const Policy &arg_policy,
+      const ViewType &arg_result,
+      std::enable_if_t<Kokkos::is_view<ViewType>::value &&
+                           !Kokkos::is_reducer<ReducerType>::value,
+                       void *> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(fix_policy(arg_policy)),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     arg_functor, m_policy.team_size())) {}
+
+  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
+                        const ReducerType &reducer)
+      : m_functor(arg_functor),
+        m_policy(fix_policy(arg_policy)),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) +
+                 FunctorTeamShmemSize<FunctorType>::value(
+                     arg_functor, m_policy.team_size())) {
+    /*static_assert( std::is_same< typename ViewType::memory_space
+                            , Kokkos::HostSpace >::value
+    , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
+    );*/
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_UniqueToken.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_UniqueToken.hpp
new file mode 100644
index 000000000..f9901198f
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_UniqueToken.hpp
@@ -0,0 +1,157 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_UNIQUETOKEN_HPP
+#define KOKKOS_THREADS_UNIQUETOKEN_HPP
+
+#include <Kokkos_UniqueToken.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+template <>
+class UniqueToken<Threads, UniqueTokenScope::Instance> {
+ private:
+  using buffer_type = Kokkos::View<uint32_t *, Kokkos::HostSpace>;
+  int m_count;
+  buffer_type m_buffer_view;
+  uint32_t volatile *m_buffer;
+
+ public:
+  using execution_space = Threads;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken(execution_space const & = execution_space()) noexcept
+      : m_count(::Kokkos::Threads::impl_thread_pool_size()),
+        m_buffer_view(buffer_type()),
+        m_buffer(nullptr) {}
+
+  UniqueToken(size_type max_size, execution_space const & = execution_space())
+      : m_count(max_size > ::Kokkos::Threads::impl_thread_pool_size()
+                    ? ::Kokkos::Threads::impl_thread_pool_size()
+                    : max_size),
+        m_buffer_view(
+            max_size > ::Kokkos::Threads::impl_thread_pool_size()
+                ? buffer_type()
+                : buffer_type("UniqueToken::m_buffer_view",
+                              ::Kokkos::Impl::concurrent_bitset::buffer_bound(
+                                  m_count))),
+        m_buffer(m_buffer_view.data()) {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept { return m_count; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept {
+    KOKKOS_IF_ON_HOST((
+        if (m_buffer == nullptr) {
+          return Threads::impl_thread_pool_rank();
+        } else {
+          const ::Kokkos::pair<int, int> result =
+              ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
+                  m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);
+
+          if (result.first < 0) {
+            ::Kokkos::abort(
+                "UniqueToken<Threads> failure to acquire tokens, no tokens "
+                "available");
+          }
+          return result.first;
+        }))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief release a value acquired by generate
+  KOKKOS_INLINE_FUNCTION
+  void release(int i) const noexcept {
+    KOKKOS_IF_ON_HOST((if (m_buffer != nullptr) {
+      ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
+    }))
+
+    KOKKOS_IF_ON_DEVICE(((void)i;))
+  }
+};
+
+template <>
+class UniqueToken<Threads, UniqueTokenScope::Global> {
+ public:
+  using execution_space = Threads;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken(execution_space const & = execution_space()) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept {
+    KOKKOS_IF_ON_HOST((return Threads::impl_thread_pool_size();))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept {
+    KOKKOS_IF_ON_HOST((return Threads::impl_thread_pool_rank();))
+
+    KOKKOS_IF_ON_DEVICE((return 0;))
+  }
+
+  /// \brief release a value acquired by generate
+  KOKKOS_INLINE_FUNCTION
+  void release(int) const noexcept {}
+};
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
index 401f3c0b1..5e8ac4604 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
@@ -64,13 +64,13 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
   FunctorType m_functor;
 
   template <class TagType>
-  typename std::enable_if<std::is_same<TagType, void>::value>::type exec_one(
+  std::enable_if_t<std::is_void<TagType>::value> exec_one(
       const std::int32_t w) const noexcept {
     m_functor(w);
   }
 
   template <class TagType>
-  typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_one(
+  std::enable_if_t<!std::is_void<TagType>::value> exec_one(
       const std::int32_t w) const noexcept {
     const TagType t{};
     m_functor(t, w);
diff --git a/packages/kokkos/core/src/View/Hooks/Kokkos_ViewHooks.hpp b/packages/kokkos/core/src/View/Hooks/Kokkos_ViewHooks.hpp
new file mode 100644
index 000000000..77b2730b1
--- /dev/null
+++ b/packages/kokkos/core/src/View/Hooks/Kokkos_ViewHooks.hpp
@@ -0,0 +1,151 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEWHOOKS_HPP
+#define KOKKOS_EXPERIMENTAL_VIEWHOOKS_HPP
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace Impl {
+template <typename View>
+using copy_subscription_function_type = void (*)(View &, const View &);
+
+template <template <typename> class Invoker, typename... Subscribers>
+struct invoke_subscriber_impl;
+
+template <template <typename> class Invoker>
+struct invoke_subscriber_impl<Invoker> {
+  template <typename ViewType>
+  static void invoke(ViewType &, const ViewType &) {}
+};
+
+template <template <typename> class Invoker, typename Subscriber,
+          typename... RemSubscribers>
+struct invoke_subscriber_impl<Invoker, Subscriber, RemSubscribers...> {
+  template <typename ViewType>
+  static void invoke(ViewType &self, const ViewType &other) {
+    Invoker<Subscriber>::call(self, other);
+    invoke_subscriber_impl<Invoker, RemSubscribers...>::invoke(self, other);
+  }
+};
+
+template <typename Subscriber>
+struct copy_constructor_invoker {
+  template <typename View>
+  static void call(View &self, const View &other) {
+    Subscriber::copy_constructed(self, other);
+  }
+};
+
+template <typename Subscriber>
+struct move_constructor_invoker {
+  template <typename View>
+  static void call(View &self, const View &other) {
+    Subscriber::move_constructed(self, other);
+  }
+};
+
+template <typename Subscriber>
+struct copy_assignment_operator_invoker {
+  template <typename View>
+  static void call(View &self, const View &other) {
+    Subscriber::copy_assigned(self, other);
+  }
+};
+
+template <typename Subscriber>
+struct move_assignment_operator_invoker {
+  template <typename View>
+  static void call(View &self, const View &other) {
+    Subscriber::move_assigned(self, other);
+  }
+};
+}  // namespace Impl
+
+struct EmptyViewHooks {
+  using hooks_policy = EmptyViewHooks;
+
+  template <typename View>
+  static void copy_construct(View &, const View &) {}
+  template <typename View>
+  static void copy_assign(View &, const View &) {}
+  template <typename View>
+  static void move_construct(View &, const View &) {}
+  template <typename View>
+  static void move_assign(View &, const View &) {}
+};
+
+template <class... Subscribers>
+struct SubscribableViewHooks {
+  using hooks_policy = SubscribableViewHooks<Subscribers...>;
+
+  template <typename View>
+  static void copy_construct(View &self, const View &other) {
+    Impl::invoke_subscriber_impl<Impl::copy_constructor_invoker,
+                                 Subscribers...>::invoke(self, other);
+  }
+  template <typename View>
+  static void copy_assign(View &self, const View &other) {
+    Impl::invoke_subscriber_impl<Impl::copy_assignment_operator_invoker,
+                                 Subscribers...>::invoke(self, other);
+  }
+  template <typename View>
+  static void move_construct(View &self, const View &other) {
+    Impl::invoke_subscriber_impl<Impl::move_constructor_invoker,
+                                 Subscribers...>::invoke(self, other);
+  }
+  template <typename View>
+  static void move_assign(View &self, const View &other) {
+    Impl::invoke_subscriber_impl<Impl::move_assignment_operator_invoker,
+                                 Subscribers...>::invoke(self, other);
+  }
+};
+
+using DefaultViewHooks = EmptyViewHooks;
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif  // KOKKOS_EXPERIMENTAL_VIEWHOOKS_HPP
diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
index 2a06cb65e..aedb8d035 100644
--- a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp
@@ -49,12 +49,13 @@
 #include <Kokkos_Cuda.hpp>
 #include <Cuda/Kokkos_Cuda_Half_Impl_Type.hpp>
 #include <Cuda/Kokkos_Cuda_Half_Conversion.hpp>
-#include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_Parallel_MDRange.hpp>
+#include <Cuda/Kokkos_Cuda_Parallel_Range.hpp>
+#include <Cuda/Kokkos_Cuda_Parallel_Team.hpp>
 #include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
 #include <Cuda/Kokkos_Cuda_Instance.hpp>
 #include <Cuda/Kokkos_Cuda_View.hpp>
 #include <Cuda/Kokkos_Cuda_Team.hpp>
-#include <Cuda/Kokkos_Cuda_Parallel.hpp>
 #include <Cuda/Kokkos_Cuda_Task.hpp>
 #include <Cuda/Kokkos_Cuda_MDRangePolicy.hpp>
 #include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ModifyingSequenceOperations.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENACC.hpp
similarity index 89%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_ModifyingSequenceOperations.hpp
rename to packages/kokkos/core/src/decl/Kokkos_Declare_OPENACC.hpp
index dacb82bfc..5c09b7a3b 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ModifyingSequenceOperations.hpp
+++ b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENACC.hpp
@@ -42,10 +42,13 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_MOD_SEQ_OPS_INC_ALL_HPP
-#define KOKKOS_STD_MOD_SEQ_OPS_INC_ALL_HPP
+#ifndef KOKKOS_DECLARE_OPENACC_HPP
+#define KOKKOS_DECLARE_OPENACC_HPP
 
-#include "./modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet1.hpp"
-#include "./modifying_sequence_ops/Kokkos_ModifyingSequenceOperationsSet2.hpp"
+#if defined(KOKKOS_ENABLE_OPENACC)
+#include <OpenACC/Kokkos_OpenACC.hpp>
+#include <OpenACC/Kokkos_OpenACCSpace.hpp>
+#include <OpenACC/Kokkos_OpenACC_Traits.hpp>
+#endif
 
 #endif
diff --git a/packages/kokkos/core/src/desul/atomics/CUDA.hpp b/packages/kokkos/core/src/desul/atomics/CUDA.hpp
deleted file mode 100644
index be308a232..000000000
--- a/packages/kokkos/core/src/desul/atomics/CUDA.hpp
+++ /dev/null
@@ -1,541 +0,0 @@
-/* 
-Copyright (c) 2019, Lawrence Livermore National Security, LLC
-and DESUL project contributors. See the COPYRIGHT file for details.
-Source: https://github.com/desul/desul
-
-SPDX-License-Identifier: (BSD-3-Clause)
-*/
-#ifndef DESUL_ATOMICS_CUDA_HPP_
-#define DESUL_ATOMICS_CUDA_HPP_
-
-#ifdef DESUL_HAVE_CUDA_ATOMICS
-// When building with Clang we need to include the device functions always since Clang
-// must see a consistent overload set in both device and host compilation, but that
-// means we need to know on the host what to make visible, i.e. we need a host side
-// compile knowledge of architecture.
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) || \
-    (!defined(__NVCC__) && !defined(DESUL_CUDA_ARCH_IS_PRE_VOLTA))
-#define DESUL_HAVE_CUDA_ATOMICS_ASM
-#include <desul/atomics/cuda/CUDA_asm.hpp>
-#endif
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)) || \
-    (!defined(__NVCC__) && !defined(DESUL_HAVE_CUDA_ATOMICS_ASM))
-namespace desul {
-namespace Impl {
-template<class T>
-struct is_cuda_atomic_integer_type {
-  static constexpr bool value = std::is_same<T,int>::value ||
-                                std::is_same<T,unsigned int>::value ||
-                                std::is_same<T,unsigned long long int>::value;
-};
-
-template<class T>
-struct is_cuda_atomic_add_type {
-  static constexpr bool value = is_cuda_atomic_integer_type<T>::value ||
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
-                                std::is_same<T,double>::value || 
-#endif
-                                std::is_same<T,float>::value;
-};
-
-template<class T>
-struct is_cuda_atomic_sub_type {
-  static constexpr bool value = std::is_same<T,int>::value ||
-                                std::is_same<T,unsigned int>::value;
-};
-} // Impl
-
-// Atomic Add
-template<class T>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
-atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicAdd(dest,val);
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
-atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicAdd(dest,val);
-  __threadfence();
-  return return_val;
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
-atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_add(dest,val,MemoryOrder(),MemoryScopeDevice());
-}
-
-
-// Atomic Sub 
-template<class T>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
-atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicSub(dest,val);
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
-atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicSub(dest,val);
-  __threadfence();
-  return return_val;
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
-atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_sub(dest,val,MemoryOrder(),MemoryScopeDevice());
-}
-
-// Wrap around atomic add
-__device__ inline unsigned int atomic_fetch_inc_mod(unsigned int* dest,
-                                                    unsigned int val,
-                                                    MemoryOrderRelaxed,
-                                                    MemoryScopeDevice) {
-  return atomicInc(dest, val);
-}
-
-template <typename MemoryOrder>
-__device__ inline unsigned int atomic_fetch_inc_mod(unsigned int* dest,
-                                                    unsigned int val,
-                                                    MemoryOrder,
-                                                    MemoryScopeDevice) {
-  __threadfence();
-  unsigned int return_val = atomicInc(dest, val);
-  __threadfence();
-  return return_val;
-}
-
-template <typename MemoryOrder>
-__device__ inline unsigned int atomic_fetch_inc_mod(unsigned int* dest,
-                                                    unsigned int val,
-                                                    MemoryOrder,
-                                                    MemoryScopeCore) {
-  return atomic_fetch_inc_mod(dest, val, MemoryOrder(), MemoryScopeDevice());
-}
-
-// Wrap around atomic sub
-__device__ inline unsigned int atomic_fetch_dec_mod(unsigned int* dest,
-                                                    unsigned int val,
-                                                    MemoryOrderRelaxed,
-                                                    MemoryScopeDevice) {
-  return atomicDec(dest, val);
-}
-
-template <typename MemoryOrder>
-__device__ inline unsigned int atomic_fetch_dec_mod(unsigned int* dest,
-                                                    unsigned int val,
-                                                    MemoryOrder,
-                                                    MemoryScopeDevice) {
-  __threadfence();
-  unsigned int return_val = atomicDec(dest, val);
-  __threadfence();
-  return return_val;
-}
-
-template <typename MemoryOrder>
-__device__ inline unsigned int atomic_fetch_dec_mod(unsigned int* dest,
-                                                    unsigned int val,
-                                                    MemoryOrder,
-                                                    MemoryScopeCore) {
-  return atomic_fetch_dec_mod(dest, val, MemoryOrder(), MemoryScopeDevice());
-}
-
-// Atomic Inc
-template <typename T>
-__device__ inline
-    typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value, T>::type
-    atomic_fetch_inc(T* dest, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicAdd(dest, T(1));
-}
-
-template <typename T, typename MemoryOrder>
-__device__ inline
-    typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value, T>::type
-    atomic_fetch_inc(T* dest, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicAdd(dest, T(1));
-  __threadfence();
-
-  return return_val;
-}
-
-template <typename T, typename MemoryOrder>
-__device__ inline
-    typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value, T>::type
-    atomic_fetch_inc(T* dest, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_add(dest, T(1), MemoryOrder(), MemoryScopeDevice());
-}
-
-// Atomic Dec
-template <typename T>
-__device__ inline
-    typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value, T>::type
-    atomic_fetch_dec(T* dest, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicSub(dest, T(1));
-}
-
-template <typename T, typename MemoryOrder>
-__device__ inline
-    typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value, T>::type
-    atomic_fetch_dec(T* dest, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicSub(dest, T(1));
-  __threadfence();
-  return return_val;
-}
-
-template <typename T, typename MemoryOrder>
-__device__ inline
-    typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value, T>::type
-    atomic_fetch_dec(T* dest, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_sub(dest, T(1), MemoryOrder(), MemoryScopeDevice());
-}
-
-// Atomic Max
-template<class T>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicMax(dest,val);
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicMax(dest,val);
-  __threadfence();
-  return return_val;
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_max(dest,val,MemoryOrder(),MemoryScopeDevice());
-}
-
-// Atomic Min
-template<class T>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicMin(dest,val);
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicMin(dest,val);
-  __threadfence();
-  return return_val;
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_min(dest,val,MemoryOrder(),MemoryScopeDevice());
-}
-
-// Atomic And
-template<class T>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicAnd(dest,val);
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicAnd(dest,val);
-  __threadfence();
-  return return_val;
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_and(dest,val,MemoryOrder(),MemoryScopeDevice());
-}
-
-// Atomic XOR
-template<class T>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicXor(dest,val);
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicXor(dest,val);
-  __threadfence();
-  return return_val;
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_xor(dest,val,MemoryOrder(),MemoryScopeDevice());
-}
-
-// Atomic OR
-template<class T>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
-  return atomicOr(dest,val);
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
-  __threadfence();
-  T return_val = atomicOr(dest,val);
-  __threadfence();
-  return return_val;
-}
-
-template<class T, class MemoryOrder>
-__device__ inline
-typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
-atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
-  return atomic_fetch_or(dest,val,MemoryOrder(),MemoryScopeDevice());
-}
-} // desul
-#endif
-
-#if !defined(__NVCC__)
-// Functions defined as device functions in CUDA which don't exist in the GCC overload set
-namespace desul {
-
-#if defined(DESUL_HAVE_CUDA_ATOMICS_ASM)
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(TYPE,ORDER,SCOPE) \
-    inline void atomic_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
-    (void) atomic_fetch_add(dest, val, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
-  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
-
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(TYPE,ORDER,SCOPE) \
-    inline void atomic_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
-    (void) atomic_fetch_sub(dest, val, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
-  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
-
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_INC(TYPE,ORDER,SCOPE) \
-    inline void atomic_inc(TYPE* const dest, ORDER order, SCOPE scope) { \
-    (void) atomic_fetch_inc(dest, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_INC(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
-
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(TYPE,ORDER,SCOPE) \
-    inline void atomic_dec(TYPE* const dest, ORDER order, SCOPE scope) { \
-    (void) atomic_fetch_dec(dest, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(unsigned,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
-
-#endif // DESUL_HAVE_CUDA_ATOMICS_ASM
-
-#define DESUL_IMPL_CUDA_HOST_ATOMIC_INC_MOD(TYPE,ORDER,SCOPE) \
-  inline TYPE atomic_fetch_inc_mod(TYPE* dest, TYPE val, ORDER order, SCOPE scope) { \
-  using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(TYPE)>::type; \
-  cas_t oldval = reinterpret_cast<cas_t&>(*dest); \
-  cas_t assume = oldval; \
-  do { \
-    assume = oldval; \
-    TYPE newval = (reinterpret_cast<TYPE&>(assume) >= val) ? static_cast<TYPE>(0) : reinterpret_cast<TYPE&>(assume) + static_cast<TYPE>(1); \
-    oldval = desul::atomic_compare_exchange(reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope); \
-  } while (assume != oldval); \
-  return reinterpret_cast<TYPE&>(oldval); \
-}
-DESUL_IMPL_CUDA_HOST_ATOMIC_INC_MOD(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
-#define DESUL_IMPL_CUDA_HOST_ATOMIC_DEC_MOD(TYPE,ORDER,SCOPE) \
-    inline TYPE atomic_fetch_dec_mod(TYPE* dest, TYPE val, ORDER order, SCOPE scope) { \
-    using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(TYPE)>::type; \
-    cas_t oldval = reinterpret_cast<cas_t&>(*dest); \
-    cas_t assume = oldval; \
-    do { \
-      assume = oldval; \
-      TYPE newval = ((reinterpret_cast<TYPE&>(assume) == static_cast<TYPE>(0)) | (reinterpret_cast<TYPE&>(assume) > val)) ? val : reinterpret_cast<TYPE&>(assume) - static_cast<TYPE>(1); \
-      oldval = desul::atomic_compare_exchange(reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope); \
-    } while (assume != oldval); \
-    return reinterpret_cast<TYPE&>(oldval); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_DEC_MOD(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
-
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(TYPE,ORDER,SCOPE) \
-    inline TYPE atomic_fetch_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
-      return Impl::atomic_fetch_oper(Impl::AddOper<TYPE, const TYPE>(),dest, val, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
-
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(TYPE,ORDER,SCOPE) \
-    inline TYPE atomic_fetch_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
-      return Impl::atomic_fetch_oper(Impl::SubOper<TYPE, const TYPE>(),dest, val, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
-
-
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(TYPE,ORDER,SCOPE) \
-    inline TYPE atomic_fetch_max(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
-      return Impl::atomic_fetch_oper(Impl::MaxOper<TYPE, const TYPE>(), dest, val, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(int,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
-//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
-
-  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(TYPE,ORDER,SCOPE) \
-    inline TYPE atomic_fetch_min(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
-      return Impl::atomic_fetch_oper(Impl::MinOper<TYPE, const TYPE>(), dest, val, order, scope); \
-  }
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(int,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
-  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
-//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
-//  inline void atomic_fetch_max(int32_t* const dest, int32_t val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-
-}  // namespace desul
-
-// Functions defined int the GCC overload set but not in the device overload set
-namespace desul {
-  __device__ inline
-  unsigned long long atomic_fetch_add(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_fetch_add(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_fetch_add(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::AddOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_fetch_sub(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_fetch_sub(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::SubOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_fetch_max(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::MaxOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_fetch_min(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::MinOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_fetch_or(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::OrOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_fetch_or(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_fetch_xor(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::XorOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_fetch_xor(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_fetch_and(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::AndOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_fetch_and(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_fetch_oper(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
-  }
-
-
-  __device__ inline
-  unsigned long long atomic_add_fetch(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_add_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_add_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::AddOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_sub_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_sub_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::SubOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_or_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_or_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::OrOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_xor_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_xor_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::XorOper<long, const long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long long atomic_and_fetch(long long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
-  }
-  __device__ inline
-  long atomic_and_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
-    return Impl::atomic_oper_fetch(Impl::AndOper<long, const long>(), dest, val, order, scope);
-  }
-}  // namespace desul
-#endif
-
-#endif  // DESUL_HAVE_CUDA_ATOMICS
-#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
deleted file mode 100644
index 14e0ab4cf..000000000
--- a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-Copyright (c) 2019, Lawrence Livermore National Security, LLC
-and DESUL project contributors. See the COPYRIGHT file for details.
-Source: https://github.com/desul/desul
-
-SPDX-License-Identifier: (BSD-3-Clause)
-*/
-
-#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
-#define DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
-
-// clang-format off
-#include "desul/atomics/SYCLConversions.hpp"
-#include "desul/atomics/Common.hpp"
-
-#include <CL/sycl.hpp>
-// clang-format on
-
-#ifdef DESUL_HAVE_SYCL_ATOMICS
-
-namespace desul {
-
-template <class MemoryOrder, class MemoryScope>
-inline void atomic_thread_fence(MemoryOrder, MemoryScope) {
-  sycl::atomic_fence(
-      Impl::DesulToSYCLMemoryOrder<MemoryOrder, /*extended namespace*/ false>::value,
-      Impl::DesulToSYCLMemoryScope<MemoryScope, /*extended namespace*/ false>::value);
-}
-
-// FIXME_SYCL We need to either use generic_space or figure out how to check for the
-// correct adress space in a SYCL-portable way.
-#ifndef __NVPTX__
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4,
-                "this function assumes an unsigned int is 32-bit");
-  auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<unsigned int>(dest);
-  if (l) {
-    Impl::sycl_atomic_ref<unsigned int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::local_space>
-    dest_ref(*reinterpret_cast<unsigned int*>(dest));
-    dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned int*>(&compare),
-                                     *reinterpret_cast<unsigned int*>(&value));
-    return compare;
-  } else {
-    Impl::sycl_atomic_ref<unsigned int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::global_space>
-    dest_ref(*reinterpret_cast<unsigned int*>(dest));
-    dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned int*>(&compare),
-                                     *reinterpret_cast<unsigned int*>(&value));
-    return compare;
-  }
-}
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8,
-                "this function assumes an unsigned long long is 64-bit");
-  auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<unsigned long long int>(dest);
-  if (l) {
-    Impl::sycl_atomic_ref<unsigned long long int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::local_space>
-    dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
-    dest_ref.compare_exchange_strong(
-        *reinterpret_cast<unsigned long long int*>(&compare),
-        *reinterpret_cast<unsigned long long int*>(&value));
-    return compare;
-  } else {
-    Impl::sycl_atomic_ref<unsigned long long int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::global_space>
-    dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
-    dest_ref.compare_exchange_strong(
-        *reinterpret_cast<unsigned long long int*>(&compare),
-        *reinterpret_cast<unsigned long long int*>(&value));
-    return compare;
-  }
-}
-
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(T* const dest,
-                                                                 T value,
-                                                                 MemoryOrder,
-                                                                 MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4,
-                "this function assumes an unsigned int is 32-bit");
-  auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<unsigned int>(dest);
-  if (l) {
-    Impl::sycl_atomic_ref<unsigned int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::local_space>
-    dest_ref(*reinterpret_cast<unsigned int*>(dest));
-    unsigned int return_val =
-        dest_ref.exchange(*reinterpret_cast<unsigned int*>(&value));
-    return reinterpret_cast<T&>(return_val);
-  } else {
-    Impl::sycl_atomic_ref<unsigned int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::global_space>
-    dest_ref(*reinterpret_cast<unsigned int*>(dest));
-    unsigned int return_val =
-        dest_ref.exchange(*reinterpret_cast<unsigned int*>(&value));
-    return reinterpret_cast<T&>(return_val);
-  }
-}
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(T* const dest,
-                                                                 T value,
-                                                                 MemoryOrder,
-                                                                 MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8,
-                "this function assumes an unsigned long long is 64-bit");
-  auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<unsigned long long int>(dest);
-  if (l) {
-    Impl::sycl_atomic_ref<unsigned long long int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::local_space>
-    dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
-    unsigned long long int return_val =
-        dest_ref.exchange(*reinterpret_cast<unsigned long long int*>(&value));
-    return reinterpret_cast<T&>(return_val);
-  } else {
-    Impl::sycl_atomic_ref<unsigned long long int,
-                          MemoryOrder,
-                          MemoryScopeDevice,
-                          sycl::access::address_space::global_space>
-    dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
-    unsigned long long int return_val =
-        dest_ref.exchange(*reinterpret_cast<unsigned long long int*>(&value));
-    return reinterpret_cast<T&>(return_val);
-  }
-}
-#else
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4,
-                "this function assumes an unsigned int is 32-bit");
-  Impl::sycl_atomic_ref<unsigned int,
-                        MemoryOrder,
-                        MemoryScope,
-                        sycl::access::address_space::global_space>
-  dest_ref(*reinterpret_cast<unsigned int*>(dest));
-  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned int*>(&compare),
-                                   *reinterpret_cast<unsigned int*>(&value));
-  return compare;
-}
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8,
-                "this function assumes an unsigned long long is 64-bit");
-  Impl::sycl_atomic_ref<unsigned long long int,
-                        MemoryOrder,
-                        MemoryScope,
-                        sycl::access::address_space::global_space>
-  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
-  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned long long int*>(&compare),
-                                   *reinterpret_cast<unsigned long long int*>(&value));
-  return compare;
-}
-
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(T* const dest,
-                                                                 T value,
-                                                                 MemoryOrder,
-                                                                 MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4,
-                "this function assumes an unsigned int is 32-bit");
-  Impl::sycl_atomic_ref<unsigned int,
-                        MemoryOrder,
-                        MemoryScope,
-                        sycl::access::address_space::global_space>
-  dest_ref(*reinterpret_cast<unsigned int*>(dest));
-  unsigned int return_val = dest_ref.exchange(*reinterpret_cast<unsigned int*>(&value));
-  return reinterpret_cast<T&>(return_val);
-}
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(T* const dest,
-                                                                 T value,
-                                                                 MemoryOrder,
-                                                                 MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8,
-                "this function assumes an unsigned long long is 64-bit");
-  Impl::sycl_atomic_ref<unsigned long long int,
-                        MemoryOrder,
-                        MemoryScope,
-                        sycl::access::address_space::global_space>
-  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
-  unsigned long long int return_val =
-      dest_ref.exchange(reinterpret_cast<unsigned long long int&>(value));
-  return reinterpret_cast<T&>(return_val);
-}
-#endif
-
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
-atomic_compare_exchange(
-    T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) {
-  // FIXME_SYCL not implemented
-  assert(false);
-  return compare;
-}
-
-template <typename T, class MemoryOrder, class MemoryScope>
-typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
-    T* const /*dest*/, T value, MemoryOrder, MemoryScope) {
-  // FIXME_SYCL not implemented
-  assert(false);
-  return value;
-}
-
-}  // namespace desul
-
-#endif
-#endif
diff --git a/packages/kokkos/core/src/desul/atomics/SYCL.hpp b/packages/kokkos/core/src/desul/atomics/SYCL.hpp
deleted file mode 100644
index 852559101..000000000
--- a/packages/kokkos/core/src/desul/atomics/SYCL.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
-Copyright (c) 2019, Lawrence Livermore National Security, LLC
-and DESUL project contributors. See the COPYRIGHT file for details.
-Source: https://github.com/desul/desul
-
-SPDX-License-Identifier: (BSD-3-Clause)
-*/
-#ifndef DESUL_ATOMICS_SYCL_HPP_
-#define DESUL_ATOMICS_SYCL_HPP_
-
-#ifdef DESUL_HAVE_SYCL_ATOMICS
-
-// clang-format off
-#include "desul/atomics/SYCLConversions.hpp"
-#include "desul/atomics/Common.hpp"
-// clang-format on
-
-namespace desul {
-
-// FIXME_SYCL We need to either use generic_space or figure out how to check for the
-// correct adress space in a SYCL-portable way.
-#ifndef __NVPTX__
-#define DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, TYPE)                              \
-  template <class MemoryOrder>                                                     \
-  TYPE atomic_fetch_##OPER(TYPE* dest, TYPE val, MemoryOrder, MemoryScopeDevice) { \
-    auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<TYPE>(dest);                  \
-    if (l) {                                                                       \
-      Impl::sycl_atomic_ref<TYPE,                                                  \
-                            MemoryOrder,                                           \
-                            MemoryScopeDevice,                                     \
-                            sycl::access::address_space::local_space>              \
-          dest_ref(*dest);                                                         \
-      return dest_ref.fetch_##OPER(val);                                           \
-    } else {                                                                       \
-      Impl::sycl_atomic_ref<TYPE,                                                  \
-                            MemoryOrder,                                           \
-                            MemoryScopeDevice,                                     \
-                            sycl::access::address_space::global_space>             \
-          dest_ref(*dest);                                                         \
-      return dest_ref.fetch_##OPER(val);                                           \
-    }                                                                              \
-  }                                                                                \
-  template <class MemoryOrder>                                                     \
-  TYPE atomic_fetch_##OPER(TYPE* dest, TYPE val, MemoryOrder, MemoryScopeCore) {   \
-    auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<TYPE>(dest);                  \
-    if (l) {                                                                       \
-      Impl::sycl_atomic_ref<TYPE,                                                  \
-                            MemoryOrder,                                           \
-                            MemoryScopeDevice,                                     \
-                            sycl::access::address_space::local_space>              \
-          dest_ref(*dest);                                                         \
-      return dest_ref.fetch_##OPER(val);                                           \
-    } else {                                                                       \
-      Impl::sycl_atomic_ref<TYPE,                                                  \
-                            MemoryOrder,                                           \
-                            MemoryScopeDevice,                                     \
-                            sycl::access::address_space::global_space>             \
-          dest_ref(*dest);                                                         \
-      return dest_ref.fetch_##OPER(val);                                           \
-    }                                                                              \
-  }
-#else
-#define DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, TYPE)                              \
-  template <class MemoryOrder>                                                     \
-  TYPE atomic_fetch_##OPER(TYPE* dest, TYPE val, MemoryOrder, MemoryScopeDevice) { \
-    Impl::sycl_atomic_ref<TYPE,                                                    \
-                          MemoryOrder,                                             \
-                          MemoryScopeDevice,                                       \
-                          sycl::access::address_space::global_space>               \
-        dest_ref(*dest);                                                           \
-    return dest_ref.fetch_##OPER(val);                                             \
-  }                                                                                \
-  template <class MemoryOrder>                                                     \
-  TYPE atomic_fetch_##OPER(TYPE* dest, TYPE val, MemoryOrder, MemoryScopeCore) {   \
-    Impl::sycl_atomic_ref<TYPE,                                                    \
-                          MemoryOrder,                                             \
-                          MemoryScopeCore,                                         \
-                          sycl::access::address_space::global_space>               \
-        dest_ref(*dest);                                                           \
-    return dest_ref.fetch_##OPER(val);                                             \
-  }
-#endif
-
-#define DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(OPER) \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, int)           \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, unsigned int)  \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, long)          \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, unsigned long) \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, long long)     \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, unsigned long long)
-
-#define DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(OPER) \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, float)               \
-  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, double)
-
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(add)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(sub)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(and)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(or)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(xor)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(min)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(max)
-
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(add)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(sub)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(min)
-DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(max)
-
-#undef DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT
-#undef DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL
-#undef DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER
-
-}  // namespace desul
-
-#endif  // DESUL_HAVE_SYCL_ATOMICS
-#endif  // DESUL_ATOMICS_SYCL_HPP_
diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp
index 1a4e7b482..27540865a 100644
--- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp
+++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp
@@ -50,6 +50,7 @@ namespace Kokkos {
 namespace Experimental {
 class HIPSpace;            ///< Memory space on HIP GPU
 class HIPHostPinnedSpace;  ///< Memory space on Host accessible to HIP GPU
+class HIPManagedSpace;     ///< Memory migratable between Host and HIP GPU
 class HIP;                 ///< Execution space for HIP GPU
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENACC.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENACC.hpp
new file mode 100644
index 000000000..d733f993d
--- /dev/null
+++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENACC.hpp
@@ -0,0 +1,56 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENACC_FWD_HPP_
+#define KOKKOS_OPENACC_FWD_HPP_
+
+#if defined(KOKKOS_ENABLE_OPENACC)
+namespace Kokkos {
+namespace Experimental {
+class OpenACC;  ///< OpenACC execution space.
+class OpenACCSpace;
+}  // namespace Experimental
+}  // namespace Kokkos
+#endif
+#endif
diff --git a/packages/kokkos/core/src/impl/CMakeLists.txt b/packages/kokkos/core/src/impl/CMakeLists.txt
index 9ff02a2ea..203fd4a3a 100644
--- a/packages/kokkos/core/src/impl/CMakeLists.txt
+++ b/packages/kokkos/core/src/impl/CMakeLists.txt
@@ -12,7 +12,7 @@ TRIBITS_ADD_LIBRARY(
     DEPLIBS
     )
 
-SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
 
 INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/)
 
diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
index 5167c9ed6..2b2120ce4 100644
--- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
+++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -1366,9 +1366,8 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> {
 // tagged versions
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    1, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<1, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1385,9 +1384,8 @@ struct Tile_Loop_Type<
 };
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    2, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<2, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1404,9 +1402,8 @@ struct Tile_Loop_Type<
 };
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    3, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<3, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1423,9 +1420,8 @@ struct Tile_Loop_Type<
 };
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    4, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<4, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1442,9 +1438,8 @@ struct Tile_Loop_Type<
 };
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    5, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<5, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1461,9 +1456,8 @@ struct Tile_Loop_Type<
 };
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    6, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<6, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1480,9 +1474,8 @@ struct Tile_Loop_Type<
 };
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    7, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<7, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1499,9 +1492,8 @@ struct Tile_Loop_Type<
 };
 
 template <bool IsLeft, typename IType, typename Tagged>
-struct Tile_Loop_Type<
-    8, IsLeft, IType, Tagged,
-    typename std::enable_if<!std::is_same<Tagged, void>::value>::type> {
+struct Tile_Loop_Type<8, IsLeft, IType, Tagged,
+                      std::enable_if_t<!std::is_void<Tagged>::value>> {
   template <typename Func, typename Offset, typename ExtentA, typename ExtentB>
   static void apply(Func const& func, bool cond, Offset const& offset,
                     ExtentA const& a, ExtentB const& b) {
@@ -1909,25 +1901,22 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
 #endif
 
   template <typename... Args>
-  typename std::enable_if<(sizeof...(Args) == RP::rank &&
-                           std::is_same<Tag, void>::value),
-                          void>::type
+  std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void<Tag>::value),
+                   void>
   apply(Args&&... args) const {
     m_func(args...);
   }
 
   template <typename... Args>
-  typename std::enable_if<(sizeof...(Args) == RP::rank &&
-                           !std::is_same<Tag, void>::value),
-                          void>::type
+  std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void<Tag>::value),
+                   void>
   apply(Args&&... args) const {
     m_func(m_tag, args...);
   }
 
   RP const& m_rp;
   Functor const& m_func;
-  typename std::conditional<std::is_same<Tag, void>::value, int, Tag>::type
-      m_tag;
+  std::conditional_t<std::is_void<Tag>::value, int, Tag> m_tag;
 };
 
 // For ParallelReduce
@@ -2329,17 +2318,15 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
 #endif
 
   template <typename... Args>
-  typename std::enable_if<(sizeof...(Args) == RP::rank &&
-                           std::is_same<Tag, void>::value),
-                          void>::type
+  std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void<Tag>::value),
+                   void>
   apply(Args&&... args) const {
     m_func(args..., m_v);
   }
 
   template <typename... Args>
-  typename std::enable_if<(sizeof...(Args) == RP::rank &&
-                           !std::is_same<Tag, void>::value),
-                          void>::type
+  std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void<Tag>::value),
+                   void>
   apply(Args&&... args) const {
     m_func(m_tag, args..., m_v);
   }
@@ -2347,8 +2334,7 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
   RP const& m_rp;
   Functor const& m_func;
   value_type& m_v;
-  typename std::conditional<std::is_same<Tag, void>::value, int, Tag>::type
-      m_tag;
+  std::conditional_t<std::is_void<Tag>::value, int, Tag> m_tag;
 };
 
 // For ParallelReduce
@@ -2751,17 +2737,15 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
 #endif
 
   template <typename... Args>
-  typename std::enable_if<(sizeof...(Args) == RP::rank &&
-                           std::is_same<Tag, void>::value),
-                          void>::type
+  std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void<Tag>::value),
+                   void>
   apply(Args&&... args) const {
     m_func(args..., m_v);
   }
 
   template <typename... Args>
-  typename std::enable_if<(sizeof...(Args) == RP::rank &&
-                           !std::is_same<Tag, void>::value),
-                          void>::type
+  std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void<Tag>::value),
+                   void>
   apply(Args&&... args) const {
     m_func(m_tag, args..., m_v);
   }
@@ -2769,8 +2753,7 @@ struct HostIterateTile<RP, Functor, Tag, ValueType,
   RP const& m_rp;
   Functor const& m_func;
   value_type* m_v;
-  typename std::conditional<std::is_same<Tag, void>::value, int, Tag>::type
-      m_tag;
+  std::conditional_t<std::is_void<Tag>::value, int, Tag> m_tag;
 };
 
 // ------------------------------------------------------------------ //
diff --git a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp
index 688afcc10..957c3b638 100644
--- a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp
+++ b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp
@@ -883,9 +883,6 @@ struct DeviceIterateTile<6, PolicyType, Functor, Tag> {
 
 namespace Reduce {
 
-template <typename T>
-using is_void = std::is_same<T, void>;
-
 template <typename T>
 struct is_array_type : std::false_type {
   using value_type = T;
diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
index 20fc6268c..2ffcd626d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -205,14 +205,29 @@ struct ExecPolicyTraitsWithDefaults : AnalysisResults {
 };
 
 //------------------------------------------------------------------------------
+
+constexpr bool warn_if_deprecated(std::false_type) { return true; }
+KOKKOS_DEPRECATED_WITH_COMMENT(
+    "Invalid WorkTag template argument in execution policy!!")
+constexpr bool warn_if_deprecated(std::true_type) { return true; }
+#define KOKKOS_IMPL_STATIC_WARNING(...) \
+  static_assert(                        \
+      warn_if_deprecated(std::integral_constant<bool, __VA_ARGS__>()), "")
+
 template <typename... Traits>
 struct PolicyTraits
     : ExecPolicyTraitsWithDefaults<AnalyzeExecPolicy<void, Traits...>> {
   using base_t =
       ExecPolicyTraitsWithDefaults<AnalyzeExecPolicy<void, Traits...>>;
   using base_t::base_t;
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  KOKKOS_IMPL_STATIC_WARNING(!std::is_empty<typename base_t::work_tag>::value &&
+                             !std::is_void<typename base_t::work_tag>::value);
+#endif
 };
 
+#undef KOKKOS_IMPL_STATIC_WARNING
+
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
index d481a8dc0..e203c0a2b 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -88,7 +88,7 @@ __inline__ __device__ unsigned long long int atomic_compare_exchange(
 template <typename T>
 __inline__ __device__ T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
   const int tmp = atomicCAS((int*)dest, *((int*)&compare), *((int*)&val));
   return *((T*)&tmp);
 }
@@ -96,9 +96,10 @@ __inline__ __device__ T atomic_compare_exchange(
 template <typename T>
 __inline__ __device__ T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T&>
+        val) {
   using type     = unsigned long long int;
   const type tmp = atomicCAS((type*)dest, *((type*)&compare), *((type*)&val));
   return *((T*)&tmp);
@@ -107,8 +108,7 @@ __inline__ __device__ T atomic_compare_exchange(
 template <typename T>
 __inline__ __device__ T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8),
-                            const T>::type& val) {
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
   int done                 = 0;
@@ -184,7 +184,7 @@ inline unsigned long long atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
   union U {
     int i;
     T t;
@@ -203,9 +203,9 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(long),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
+                     const T&>
+        val) {
   union U {
     long i;
     T t;
@@ -225,10 +225,10 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) != sizeof(long) &&
-                                sizeof(T) == sizeof(Impl::cas128_t),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long) &&
+                         sizeof(T) == sizeof(Impl::cas128_t),
+                     const T&>
+        val) {
   union U {
     Impl::cas128_t i;
     T t;
@@ -248,12 +248,12 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T compare,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
 #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                && (sizeof(T) != 16)
+                         && (sizeof(T) != 16)
 #endif
-                                ,
-                            const T>::type& val) {
+                         ,
+                     const T>& val) {
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
@@ -375,16 +375,14 @@ KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback(
 template <class T, class MemoryOrderSuccess, class MemoryOrderFailure>
 KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong(
     T* dest, T compare, T val, MemoryOrderSuccess, MemoryOrderFailure,
-    typename std::enable_if<
+    std::enable_if_t<
         (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 ||
          sizeof(T) == 16) &&
-            std::is_same<
-                typename MemoryOrderSuccess::memory_order,
-                typename std::remove_cv<MemoryOrderSuccess>::type>::value &&
-            std::is_same<
-                typename MemoryOrderFailure::memory_order,
-                typename std::remove_cv<MemoryOrderFailure>::type>::value,
-        void const**>::type = nullptr) {
+            std::is_same<typename MemoryOrderSuccess::memory_order,
+                         std::remove_cv_t<MemoryOrderSuccess>>::value &&
+            std::is_same<typename MemoryOrderFailure::memory_order,
+                         std::remove_cv_t<MemoryOrderFailure>>::value,
+        void const**> = nullptr) {
   return __atomic_compare_exchange_n(dest, &compare, val, /* weak = */ false,
                                      MemoryOrderSuccess::gnu_constant,
                                      MemoryOrderFailure::gnu_constant);
@@ -394,16 +392,14 @@ template <class T, class MemoryOrderSuccess, class MemoryOrderFailure>
 KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong(
     T* dest, T compare, T val, MemoryOrderSuccess order_success,
     MemoryOrderFailure order_failure,
-    typename std::enable_if<
+    std::enable_if_t<
         !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
           sizeof(T) == 8 || sizeof(T) == 16) &&
-            std::is_same<
-                typename MemoryOrderSuccess::memory_order,
-                typename std::remove_cv<MemoryOrderSuccess>::type>::value &&
-            std::is_same<
-                typename MemoryOrderFailure::memory_order,
-                typename std::remove_cv<MemoryOrderFailure>::type>::value,
-        void const**>::type = nullptr) {
+            std::is_same<typename MemoryOrderSuccess::memory_order,
+                         std::remove_cv_t<MemoryOrderSuccess>>::value &&
+            std::is_same<typename MemoryOrderFailure::memory_order,
+                         std::remove_cv_t<MemoryOrderFailure>>::value,
+        void const**> = nullptr) {
   return _atomic_compare_exchange_fallback(dest, compare, val, order_success,
                                            order_failure);
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
index 4bb8b4fd5..ad5b01055 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
@@ -95,7 +95,7 @@ namespace Kokkos {
 #endif
 
 // 32-bit version
-template <class T, typename std::enable_if<sizeof(T) == 4, int>::type = 0>
+template <class T, std::enable_if_t<sizeof(T) == 4, int> = 0>
 __inline__ __device__ bool atomic_compare_exchange_weak(
     T volatile* const dest, T* const expected, T const desired,
     std::memory_order success_order = std::memory_order_seq_cst,
@@ -168,7 +168,7 @@ __inline__ __device__ bool atomic_compare_exchange_weak(
 }
 
 // 64-bit version
-template <class T, typename std::enable_if<sizeof(T) == 8, int>::type = 0>
+template <class T, std::enable_if_t<sizeof(T) == 8, int> = 0>
 bool atomic_compare_exchange_weak(
     T volatile* const dest, T* const expected, T const desired,
     std::memory_order success_order = std::memory_order_seq_cst,
@@ -268,7 +268,7 @@ inline unsigned long long atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
   union U {
     int i;
     T t;
@@ -287,9 +287,9 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(long),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
+                     const T&>
+        val) {
   union U {
     long i;
     T t;
@@ -309,10 +309,10 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) != sizeof(long) &&
-                                sizeof(T) == sizeof(Impl::cas128_t),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long) &&
+                         sizeof(T) == sizeof(Impl::cas128_t),
+                     const T&>
+        val) {
   union U {
     Impl::cas128_t i;
     T t;
@@ -332,12 +332,12 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T compare,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
 #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                && (sizeof(T) != 16)
+                         && (sizeof(T) != 16)
 #endif
-                                ,
-                            const T>::type& val) {
+                         ,
+                     const T>& val) {
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
index cd840983d..a8f77d835 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -78,9 +78,9 @@ __inline__ __device__ unsigned long long int atomic_exchange(
 
 /** \brief  Atomic exchange for any type with compatible size */
 template <typename T>
-__inline__ __device__ T atomic_exchange(
-    volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+__inline__ __device__ T
+atomic_exchange(volatile T* const dest,
+                std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
   // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
@@ -93,9 +93,10 @@ __inline__ __device__ T atomic_exchange(
 template <typename T>
 __inline__ __device__ T atomic_exchange(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T&>
+        val) {
   using type = unsigned long long int;
 
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
@@ -108,10 +109,9 @@ __inline__ __device__ T atomic_exchange(
 }
 
 template <typename T>
-__inline__ __device__ T
-atomic_exchange(volatile T* const dest,
-                typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8),
-                                        const T>::type& val) {
+__inline__ __device__ T atomic_exchange(
+    volatile T* const dest,
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
@@ -141,7 +141,7 @@ atomic_exchange(volatile T* const dest,
 template <typename T>
 __inline__ __device__ void atomic_assign(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T&> val) {
   // (void) __ullAtomicExch( (int*) dest , *((int*)&val) );
   (void)atomicExch(((int*)dest), *((int*)&val));
 }
@@ -149,9 +149,10 @@ __inline__ __device__ void atomic_assign(
 template <typename T>
 __inline__ __device__ void atomic_assign(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T&>
+        val) {
   using type = unsigned long long int;
   // (void) __ullAtomicExch( (type*) dest , *((type*)&val) );
   (void)atomicExch(((type*)dest), *((type*)&val));
@@ -160,9 +161,10 @@ __inline__ __device__ void atomic_assign(
 template <typename T>
 __inline__ __device__ void atomic_assign(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) != sizeof(unsigned long long int),
-                            const T&>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) != sizeof(unsigned long long int),
+                     const T&>
+        val) {
   (void)atomic_exchange(dest, val);
 }
 
@@ -175,10 +177,11 @@ __inline__ __device__ void atomic_assign(
 #if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
 
 template <typename T>
-inline T atomic_exchange(volatile T* const dest,
-                         typename std::enable_if<sizeof(T) == sizeof(int) ||
-                                                     sizeof(T) == sizeof(long),
-                                                 const T&>::type val) {
+inline T atomic_exchange(
+    volatile T* const dest,
+    std::enable_if_t<sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long),
+                     const T&>
+        val) {
   using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
@@ -209,8 +212,7 @@ inline T atomic_exchange(volatile T* const dest,
 template <typename T>
 inline T atomic_exchange(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(Impl::cas128_t), const T&>::type
-        val) {
+    std::enable_if_t<sizeof(T) == sizeof(Impl::cas128_t), const T&> val) {
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
@@ -236,14 +238,13 @@ inline T atomic_exchange(
 //----------------------------------------------------------------------------
 
 template <typename T>
-inline T atomic_exchange(
-    volatile T* const dest,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+inline T atomic_exchange(volatile T* const dest,
+                         std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
 #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                && (sizeof(T) != 16)
+                                              && (sizeof(T) != 16)
 #endif
-                                ,
-                            const T>::type& val) {
+                                              ,
+                                          const T>& val) {
   while (!Impl::lock_address_host_space((void*)dest))
     ;
   Kokkos::memory_fence();
@@ -268,10 +269,11 @@ inline T atomic_exchange(
 }
 
 template <typename T>
-inline void atomic_assign(volatile T* const dest,
-                          typename std::enable_if<sizeof(T) == sizeof(int) ||
-                                                      sizeof(T) == sizeof(long),
-                                                  const T&>::type val) {
+inline void atomic_assign(
+    volatile T* const dest,
+    std::enable_if_t<sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long),
+                     const T&>
+        val) {
   using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
 
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
@@ -301,8 +303,7 @@ inline void atomic_assign(volatile T* const dest,
 template <typename T>
 inline void atomic_assign(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(Impl::cas128_t), const T&>::type
-        val) {
+    std::enable_if_t<sizeof(T) == sizeof(Impl::cas128_t), const T&> val) {
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
@@ -323,14 +324,13 @@ inline void atomic_assign(
 #endif
 
 template <typename T>
-inline void atomic_assign(
-    volatile T* const dest,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+inline void atomic_assign(volatile T* const dest,
+                          std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
 #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                && (sizeof(T) != 16)
+                                               && (sizeof(T) != 16)
 #endif
-                                ,
-                            const T>::type& val) {
+                                               ,
+                                           const T>& val) {
   while (!Impl::lock_address_host_space((void*)dest))
     ;
   Kokkos::memory_fence();
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
index 9a2b13deb..c188f4542 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -88,9 +88,9 @@ __inline__ __device__ double atomic_fetch_add(volatile double* const dest,
 #endif
 
 template <typename T>
-__inline__ __device__ T atomic_fetch_add(
-    volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+__inline__ __device__ T
+atomic_fetch_add(volatile T* const dest,
+                 std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
   // to work around a bug in the clang cuda compiler, the name here needs to be
   // different from the one internal to the other overloads
   union U1 {
@@ -113,9 +113,10 @@ __inline__ __device__ T atomic_fetch_add(
 template <typename T>
 __inline__ __device__ T atomic_fetch_add(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T>
+        val) {
   // to work around a bug in the clang cuda compiler, the name here needs to be
   // different from the one internal to the other overloads
   union U2 {
@@ -138,10 +139,9 @@ __inline__ __device__ T atomic_fetch_add(
 //----------------------------------------------------------------------------
 
 template <typename T>
-__inline__ __device__ T
-atomic_fetch_add(volatile T* const dest,
-                 typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8),
-                                         const T>::type& val) {
+__inline__ __device__ T atomic_fetch_add(
+    volatile T* const dest,
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
   int done                 = 0;
@@ -236,7 +236,7 @@ inline unsigned long long int atomic_fetch_add(
 template <typename T>
 inline T atomic_fetch_add(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
   union U {
     int i;
     T t;
@@ -259,10 +259,11 @@ inline T atomic_fetch_add(
 }
 
 template <typename T>
-inline T atomic_fetch_add(volatile T* const dest,
-                          typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                                      sizeof(T) == sizeof(long),
-                                                  const T>::type val) {
+inline T atomic_fetch_add(
+    volatile T* const dest,
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
+                     const T>
+        val) {
   union U {
     long i;
     T t;
@@ -288,10 +289,10 @@ inline T atomic_fetch_add(volatile T* const dest,
 template <typename T>
 inline T atomic_fetch_add(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) != sizeof(long) &&
-                                sizeof(T) == sizeof(Impl::cas128_t),
-                            const T>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(long) &&
+                         sizeof(T) == sizeof(Impl::cas128_t),
+                     const T>
+        val) {
   union U {
     Impl::cas128_t i;
     T t;
@@ -317,14 +318,13 @@ inline T atomic_fetch_add(
 //----------------------------------------------------------------------------
 
 template <typename T>
-inline T atomic_fetch_add(
-    volatile T* const dest,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+inline T atomic_fetch_add(volatile T* const dest,
+                          std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
 #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
-                                && (sizeof(T) != 16)
+                                               && (sizeof(T) != 16)
 #endif
-                                ,
-                            const T>::type& val) {
+                                               ,
+                                           const T>& val) {
   while (!Impl::lock_address_host_space((void*)dest))
     ;
   Kokkos::memory_fence();
@@ -365,8 +365,7 @@ T atomic_fetch_add(volatile T* const dest, const T val) {
 #elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
 
 template <typename T>
-T atomic_fetch_add(volatile T* const dest_v,
-                   typename std::add_const<T>::type val) {
+T atomic_fetch_add(volatile T* const dest_v, std::add_const_t<T> val) {
   T* dest  = const_cast<T*>(dest_v);
   T retval = *dest;
   *dest += val;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
index 148ed9744..6aaf36970 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -87,9 +87,9 @@ __inline__ __device__ unsigned int atomic_fetch_sub(volatile double* const dest,
 #endif
 
 template <typename T>
-__inline__ __device__ T atomic_fetch_sub(
-    volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+__inline__ __device__ T
+atomic_fetch_sub(volatile T* const dest,
+                 std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
   union U {
     int i;
     T t;
@@ -110,9 +110,10 @@ __inline__ __device__ T atomic_fetch_sub(
 template <typename T>
 __inline__ __device__ T atomic_fetch_sub(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T>
+        val) {
   union U {
     unsigned long long int i;
     T t;
@@ -133,10 +134,9 @@ __inline__ __device__ T atomic_fetch_sub(
 //----------------------------------------------------------------------------
 
 template <typename T>
-__inline__ __device__ T
-atomic_fetch_sub(volatile T* const dest,
-                 typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8),
-                                         const T>::type& val) {
+__inline__ __device__ T atomic_fetch_sub(
+    volatile T* const dest,
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
   int done                 = 0;
@@ -211,7 +211,7 @@ inline unsigned long long int atomic_fetch_sub(
 template <typename T>
 inline T atomic_fetch_sub(
     volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
   union U {
     int i;
     T t;
@@ -234,10 +234,11 @@ inline T atomic_fetch_sub(
 }
 
 template <typename T>
-inline T atomic_fetch_sub(volatile T* const dest,
-                          typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                                      sizeof(T) == sizeof(long),
-                                                  const T>::type val) {
+inline T atomic_fetch_sub(
+    volatile T* const dest,
+    std::enable_if_t<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long),
+                     const T>
+        val) {
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
@@ -264,8 +265,7 @@ inline T atomic_fetch_sub(volatile T* const dest,
 template <typename T>
 inline T atomic_fetch_sub(
     volatile T* const dest,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8),
-                            const T>::type& val) {
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T>& val) {
 #if defined(KOKKOS_ENABLE_RFO_PREFETCH)
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
index f6bdbca72..aac0d12c8 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -188,9 +188,10 @@ struct RShiftOper {
 template <class Oper, typename T>
 KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
     const Oper& op, volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T>
+        val) {
   union U {
     unsigned long long int i;
     T t;
@@ -213,9 +214,10 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
 template <class Oper, typename T>
 KOKKOS_INLINE_FUNCTION T atomic_oper_fetch(
     const Oper& op, volatile T* const dest,
-    typename std::enable_if<sizeof(T) != sizeof(int) &&
-                                sizeof(T) == sizeof(unsigned long long int),
-                            const T>::type val) {
+    std::enable_if_t<sizeof(T) != sizeof(int) &&
+                         sizeof(T) == sizeof(unsigned long long int),
+                     const T>
+        val) {
   union U {
     unsigned long long int i;
     T t;
@@ -236,9 +238,9 @@ KOKKOS_INLINE_FUNCTION T atomic_oper_fetch(
 }
 
 template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
-    const Oper& op, volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+KOKKOS_INLINE_FUNCTION T
+atomic_fetch_oper(const Oper& op, volatile T* const dest,
+                  std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
   union U {
     int i;
     T t;
@@ -258,9 +260,9 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
 }
 
 template <class Oper, typename T>
-KOKKOS_INLINE_FUNCTION T atomic_oper_fetch(
-    const Oper& op, volatile T* const dest,
-    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+KOKKOS_INLINE_FUNCTION T
+atomic_oper_fetch(const Oper& op, volatile T* const dest,
+                  std::enable_if_t<sizeof(T) == sizeof(int), const T> val) {
   union U {
     int i;
     T t;
@@ -282,8 +284,7 @@ KOKKOS_INLINE_FUNCTION T atomic_oper_fetch(
 template <class Oper, typename T>
 KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
     const Oper& op, volatile T* const dest,
-    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8), const T>::type
-        val) {
+    std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8), const T> val) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
   while (!Impl::lock_address_host_space((void*)dest))
     ;
@@ -344,13 +345,13 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
 template <class Oper, typename T>
 KOKKOS_INLINE_FUNCTION T
 atomic_oper_fetch(const Oper& op, volatile T* const dest,
-                  typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+                  std::enable_if_t<(sizeof(T) != 4) && (sizeof(T) != 8)
 #if defined(KOKKOS_ENABLE_ASM) && \
     defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
-                                              && (sizeof(T) != 16)
+                                       && (sizeof(T) != 16)
 #endif
-                                              ,
-                                          const T>::type& val) {
+                                       ,
+                                   const T>& val) {
 
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
   while (!Impl::lock_address_host_space((void*)dest))
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp
index f3b77a297..f4437326d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp
@@ -72,25 +72,23 @@ namespace Impl {
 template <class T, class MemoryOrder>
 KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load(
     T* ptr, MemoryOrder,
-    typename std::enable_if<
-        (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-         sizeof(T) == 8) &&
-            std::is_same<typename MemoryOrder::memory_order,
-                         typename std::remove_cv<MemoryOrder>::type>::value,
-        void const**>::type = nullptr) {
+    std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
+                      sizeof(T) == 8) &&
+                         std::is_same<typename MemoryOrder::memory_order,
+                                      std::remove_cv_t<MemoryOrder>>::value,
+                     void const**> = nullptr) {
   return __atomic_load_n(ptr, MemoryOrder::gnu_constant);
 }
 
 template <class T, class MemoryOrder>
 KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load(
     T* ptr, MemoryOrder,
-    typename std::enable_if<
-        !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-          sizeof(T) == 8) &&
-            std::is_default_constructible<T>::value &&
-            std::is_same<typename MemoryOrder::memory_order,
-                         typename std::remove_cv<MemoryOrder>::type>::value,
-        void const**>::type = nullptr) {
+    std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
+                       sizeof(T) == 8) &&
+                         std::is_default_constructible<T>::value &&
+                         std::is_same<typename MemoryOrder::memory_order,
+                                      std::remove_cv_t<MemoryOrder>>::value,
+                     void const**> = nullptr) {
   T rv{};
   __atomic_load(ptr, &rv, MemoryOrder::gnu_constant);
   return rv;
@@ -104,9 +102,9 @@ KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load(
 
 template <class T>
 __device__ __inline__ T _relaxed_atomic_load_impl(
-    T* ptr, typename std::enable_if<(sizeof(T) == 1 || sizeof(T) == 2 ||
-                                     sizeof(T) == 4 || sizeof(T) == 8),
-                                    void const**>::type = nullptr) {
+    T* ptr, std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 ||
+                              sizeof(T) == 4 || sizeof(T) == 8),
+                             void const**> = nullptr) {
   return *ptr;
 }
 
@@ -120,9 +118,9 @@ struct NoOpOper {
 
 template <class T>
 __device__ __inline__ T _relaxed_atomic_load_impl(
-    T* ptr, typename std::enable_if<!(sizeof(T) == 1 || sizeof(T) == 2 ||
-                                      sizeof(T) == 4 || sizeof(T) == 8),
-                                    void const**>::type = nullptr) {
+    T* ptr, std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 ||
+                               sizeof(T) == 4 || sizeof(T) == 8),
+                             void const**> = nullptr) {
   T rv{};
   // TODO remove a copy operation here?
   return Kokkos::Impl::atomic_oper_fetch(NoOpOper<T>{}, ptr, rv);
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp
index 264d6beaf..ffe018b4d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp
@@ -72,25 +72,23 @@ namespace Impl {
 template <class T, class MemoryOrder>
 KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store(
     T* ptr, T val, MemoryOrder,
-    typename std::enable_if<
-        (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-         sizeof(T) == 8) &&
-            std::is_same<typename MemoryOrder::memory_order,
-                         typename std::remove_cv<MemoryOrder>::type>::value,
-        void const**>::type = nullptr) {
+    std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
+                      sizeof(T) == 8) &&
+                         std::is_same<typename MemoryOrder::memory_order,
+                                      std::remove_cv_t<MemoryOrder>>::value,
+                     void const**> = nullptr) {
   __atomic_store_n(ptr, val, MemoryOrder::gnu_constant);
 }
 
 template <class T, class MemoryOrder>
 KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store(
     T* ptr, T val, MemoryOrder,
-    typename std::enable_if<
-        !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
-          sizeof(T) == 8) &&
-            std::is_default_constructible<T>::value &&
-            std::is_same<typename MemoryOrder::memory_order,
-                         typename std::remove_cv<MemoryOrder>::type>::value,
-        void const**>::type = nullptr) {
+    std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
+                       sizeof(T) == 8) &&
+                         std::is_default_constructible<T>::value &&
+                         std::is_same<typename MemoryOrder::memory_order,
+                                      std::remove_cv_t<MemoryOrder>>::value,
+                     void const**> = nullptr) {
   __atomic_store(ptr, &val, MemoryOrder::gnu_constant);
 }
 
@@ -103,9 +101,9 @@ KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store(
 template <class T>
 __device__ __inline__ void _relaxed_atomic_store_impl(
     T* ptr, T val,
-    typename std::enable_if<(sizeof(T) == 1 || sizeof(T) == 2 ||
-                             sizeof(T) == 4 || sizeof(T) == 8),
-                            void const**>::type = nullptr) {
+    std::enable_if_t<(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
+                      sizeof(T) == 8),
+                     void const**> = nullptr) {
   *ptr = val;
 }
 
@@ -120,9 +118,9 @@ struct StoreOper {
 template <class T>
 __device__ __inline__ void _relaxed_atomic_store_impl(
     T* ptr, T val,
-    typename std::enable_if<!(sizeof(T) == 1 || sizeof(T) == 2 ||
-                              sizeof(T) == 4 || sizeof(T) == 8),
-                            void const**>::type = nullptr) {
+    std::enable_if_t<!(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
+                       sizeof(T) == 8),
+                     void const**> = nullptr) {
   Kokkos::Impl::atomic_oper_fetch(StoreOper<T>{}, ptr, (T &&) val);
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
index 2f824566b..c5207b51e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
@@ -52,8 +52,6 @@
 #include <winsock2.h>
 #include <windows.h>
 
-#undef VOID
-
 namespace Kokkos {
 namespace Impl {
 #ifdef _MSC_VER
@@ -77,7 +75,7 @@ __attribute__((aligned(16)))
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(CHAR), const T&>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(CHAR), const T&> val) {
   union U {
     CHAR i;
     T t;
@@ -92,7 +90,7 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(SHORT), const T&>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(SHORT), const T&> val) {
   union U {
     SHORT i;
     T t;
@@ -107,7 +105,7 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(LONG), const T&>::type val) {
+    std::enable_if_t<sizeof(T) == sizeof(LONG), const T&> val) {
   union U {
     LONG i;
     T t;
@@ -122,8 +120,7 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(LONGLONG), const T&>::type
-        val) {
+    std::enable_if_t<sizeof(T) == sizeof(LONGLONG), const T&> val) {
   union U {
     LONGLONG i;
     T t;
@@ -138,8 +135,7 @@ inline T atomic_compare_exchange(
 template <typename T>
 inline T atomic_compare_exchange(
     volatile T* const dest, const T& compare,
-    typename std::enable_if<sizeof(T) == sizeof(Impl::cas128_t), const T&>::type
-        val) {
+    std::enable_if_t<sizeof(T) == sizeof(Impl::cas128_t), const T&> val) {
   T compare_and_result(compare);
   union U {
     Impl::cas128_t i;
diff --git a/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
index fc58b96a4..a41d19aaf 100644
--- a/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -57,93 +57,208 @@ namespace Kokkos {
 namespace Impl {
 
 KOKKOS_FORCEINLINE_FUNCTION
-int int_log2(unsigned i) {
-  enum : int { shift = sizeof(unsigned) * CHAR_BIT - 1 };
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+int int_log2_fallback(unsigned i) {
+  constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1;
+
+  int offset = 0;
+  if (i) {
+    for (offset = shift; (i & (1 << offset)) == 0; --offset)
+      ;
+  }
+  return offset;
+}
+
+KOKKOS_IMPL_DEVICE_FUNCTION
+inline int int_log2_device(unsigned i) {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+  constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1;
   return shift - __clz(i);
 #elif defined(KOKKOS_COMPILER_INTEL)
   return _bit_scan_reverse(i);
+#else
+  return int_log2_fallback(i);
+#endif
+}
+
+KOKKOS_IMPL_HOST_FUNCTION
+inline int int_log2_host(unsigned i) {
+// duplicating shift to avoid unused warning in else branch
+#if defined(KOKKOS_COMPILER_INTEL)
+  constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1;
+  (void)shift;
+  return _bit_scan_reverse(i);
 #elif defined(KOKKOS_COMPILER_CRAYC)
+  constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1;
   return i ? shift - _leadz32(i) : 0;
 #elif defined(__GNUC__) || defined(__GNUG__)
+  constexpr int shift = sizeof(unsigned) * CHAR_BIT - 1;
   return shift - __builtin_clz(i);
 #else
-  int offset = 0;
-  if (i) {
-    for (offset = shift; (i & (1 << offset)) == 0; --offset)
-      ;
-  }
-  return offset;
+  return int_log2_fallback(i);
 #endif
 }
 
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma push
+#pragma diag_suppress implicit_return_from_non_void_function
+#endif
+KOKKOS_FORCEINLINE_FUNCTION
+int int_log2(unsigned i) {
+  KOKKOS_IF_ON_DEVICE((return int_log2_device(i);))
+  KOKKOS_IF_ON_HOST((return int_log2_host(i);))
+}
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma pop
+#endif
+
 /**\brief  Find first zero bit.
  *
  *  If none then return -1 ;
  */
 KOKKOS_FORCEINLINE_FUNCTION
-int bit_first_zero(unsigned i) noexcept {
-  enum : unsigned { full = ~0u };
+int bit_first_zero_fallback(unsigned i) noexcept {
+  constexpr unsigned full = ~0u;
+
+  int offset = -1;
+  if (full != i) {
+    for (offset = 0; i & (1 << offset); ++offset)
+      ;
+  }
+  return offset;
+}
 
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+KOKKOS_IMPL_DEVICE_FUNCTION
+inline int bit_first_zero_device(unsigned i) noexcept {
+  constexpr unsigned full = ~0u;
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   return full != i ? __ffs(~i) - 1 : -1;
 #elif defined(KOKKOS_COMPILER_INTEL)
   return full != i ? _bit_scan_forward(~i) : -1;
+#else
+  (void)full;
+  return bit_first_zero_fallback(i);
+#endif
+}
+
+KOKKOS_IMPL_HOST_FUNCTION
+inline int bit_first_zero_host(unsigned i) noexcept {
+  constexpr unsigned full = ~0u;
+#if defined(KOKKOS_COMPILER_INTEL)
+  return full != i ? _bit_scan_forward(~i) : -1;
 #elif defined(KOKKOS_COMPILER_CRAYC)
   return full != i ? _popcnt(i ^ (i + 1)) - 1 : -1;
 #elif defined(KOKKOS_COMPILER_GNU) || defined(__GNUC__) || defined(__GNUG__)
   return full != i ? __builtin_ffs(~i) - 1 : -1;
 #else
+  (void)full;
+  return bit_first_zero_fallback(i);
+#endif
+}
+
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma push
+#pragma diag_suppress implicit_return_from_non_void_function
+#endif
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_first_zero(unsigned i) noexcept {
+  KOKKOS_IF_ON_DEVICE((return bit_first_zero_device(i);))
+  KOKKOS_IF_ON_HOST((return bit_first_zero_host(i);))
+}
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma pop
+#endif
+
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_forward_fallback(unsigned i) {
   int offset = -1;
-  if (full != i) {
-    for (offset = 0; i & (1 << offset); ++offset)
+  if (i) {
+    for (offset = 0; (i & (1 << offset)) == 0; ++offset)
       ;
   }
   return offset;
-#endif
 }
 
-KOKKOS_FORCEINLINE_FUNCTION
-int bit_scan_forward(unsigned i) {
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_scan_forward_device(unsigned i) {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   return __ffs(i) - 1;
 #elif defined(KOKKOS_COMPILER_INTEL)
   return _bit_scan_forward(i);
+#else
+  return bit_scan_forward_fallback(i);
+#endif
+}
+
+KOKKOS_IMPL_HOST_FUNCTION inline int bit_scan_forward_host(unsigned i) {
+#if defined(KOKKOS_COMPILER_INTEL)
+  return _bit_scan_forward(i);
 #elif defined(KOKKOS_COMPILER_CRAYC)
   return i ? _popcnt(~i & (i - 1)) : -1;
 #elif defined(KOKKOS_COMPILER_GNU) || defined(__GNUC__) || defined(__GNUG__)
   return __builtin_ffs(i) - 1;
 #else
-  int offset = -1;
-  if (i) {
-    for (offset = 0; (i & (1 << offset)) == 0; ++offset)
-      ;
-  }
-  return offset;
+  return bit_scan_forward_fallback(i);
 #endif
 }
 
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma push
+#pragma diag_suppress implicit_return_from_non_void_function
+#endif
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_forward(unsigned i) {
+  KOKKOS_IF_ON_DEVICE((return bit_scan_forward_device(i);))
+  KOKKOS_IF_ON_HOST((return bit_scan_forward_host(i);))
+}
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma pop
+#endif
+
 /// Count the number of bits set.
 KOKKOS_FORCEINLINE_FUNCTION
-int bit_count(unsigned i) {
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+int bit_count_fallback(unsigned i) {
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
+  i = i - ((i >> 1) & ~0u / 3u);                           // temp
+  i = (i & ~0u / 15u * 3u) + ((i >> 2) & ~0u / 15u * 3u);  // temp
+  i = (i + (i >> 4)) & ~0u / 255u * 15u;                   // temp
+
+  // count
+  return (int)((i * (~0u / 255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT);
+}
+
+KOKKOS_IMPL_DEVICE_FUNCTION inline int bit_count_device(unsigned i) {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   return __popc(i);
-#elif defined(__INTEL_COMPILER)
+#elif defined(KOKKOS_COMPILER_INTEL)
+  return _popcnt32(i);
+#else
+  return bit_count_fallback(i);
+#endif
+}
+
+KOKKOS_IMPL_HOST_FUNCTION inline int bit_count_host(unsigned i) {
+#if defined(KOKKOS_COMPILER_INTEL)
   return _popcnt32(i);
 #elif defined(KOKKOS_COMPILER_CRAYC)
   return _popcnt(i);
 #elif defined(__GNUC__) || defined(__GNUG__)
   return __builtin_popcount(i);
 #else
-  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
-  i = i - ((i >> 1) & ~0u / 3u);                           // temp
-  i = (i & ~0u / 15u * 3u) + ((i >> 2) & ~0u / 15u * 3u);  // temp
-  i = (i + (i >> 4)) & ~0u / 255u * 15u;                   // temp
+  return bit_count_fallback(i);
+#endif
+}
 
-  // count
-  return (int)((i * (~0u / 255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT);
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma push
+#pragma diag_suppress implicit_return_from_non_void_function
 #endif
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_count(unsigned i) {
+  KOKKOS_IF_ON_DEVICE((return bit_count_device(i);))
+  KOKKOS_IF_ON_HOST((return bit_count_host(i);))
 }
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma pop
+#endif
 
 KOKKOS_INLINE_FUNCTION
 unsigned integral_power_of_two_that_contains(const unsigned N) {
@@ -152,15 +267,6 @@ unsigned integral_power_of_two_that_contains(const unsigned N) {
 }
 
 }  // namespace Impl
-
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-
-KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION int log2(unsigned i) {
-  return Impl::int_log2(i);
-}
-
-#endif
-
 }  // namespace Kokkos
 
 #endif  // KOKKOS_BITOPS_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
index 3251cb0f5..a8fc928d1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
diff --git a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp
index e2283f11f..1a372d8c9 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp
@@ -165,8 +165,8 @@ struct ChaseLevDeque {
 
  public:
   template <class _ignore = void,
-            class         = typename std::enable_if<
-                std::is_default_constructible<CircularBufferT>::value>::type>
+            class         = std::enable_if_t<
+                std::is_default_constructible<CircularBufferT>::value>>
   ChaseLevDeque() : m_array() {}
 
   explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {}
diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
index 87f18604d..c1cb6a7d9 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -77,20 +77,30 @@ namespace Impl {
  *  having different index-seed values.
  */
 
-KOKKOS_FORCEINLINE_FUNCTION
-uint64_t clock_tic() noexcept {
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept {
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
 
   // Return value of 64-bit hi-res clock register.
-
   return clock64();
 
 #elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \
     defined(__SYCL_DEVICE_ONLY__)
+
   return intel_get_cycle_counter();
+
 #elif defined(KOKKOS_ENABLE_OPENMPTARGET)
-  return uint64_t(omp_get_wtime() * 1.e9);
-#elif defined(__i386__) || defined(__x86_64)
+
+  return omp_get_wtime() * 1.e9;
+
+#else
+
+  return 0;
+
+#endif
+}
+
+KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {
+#if defined(__i386__) || defined(__x86_64)
 
   // Return value of 64-bit hi-res clock register.
 
@@ -111,13 +121,17 @@ uint64_t clock_tic() noexcept {
 
 #else
 
-  return (uint64_t)std::chrono::high_resolution_clock::now()
-      .time_since_epoch()
-      .count();
+  return std::chrono::high_resolution_clock::now().time_since_epoch().count();
 
 #endif
 }
 
+KOKKOS_FORCEINLINE_FUNCTION
+uint64_t clock_tic() noexcept {
+  KOKKOS_IF_ON_DEVICE((return clock_tic_device();))
+  KOKKOS_IF_ON_HOST((return clock_tic_host();))
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
index 4ec851319..21a202994 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
@@ -89,10 +89,6 @@ struct CombinedReducerValueItemImpl {
   constexpr value_type& ref() & noexcept { return m_value; }
   KOKKOS_FORCEINLINE_FUNCTION
   constexpr value_type const& ref() const& noexcept { return m_value; }
-  KOKKOS_FORCEINLINE_FUNCTION
-  value_type volatile& ref() volatile& noexcept { return m_value; }
-  KOKKOS_FORCEINLINE_FUNCTION
-  value_type const volatile& ref() const volatile& noexcept { return m_value; }
 };
 
 //==============================================================================
@@ -133,15 +129,6 @@ struct CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
   KOKKOS_INLINE_FUNCTION ValueType const& get() const& noexcept {
     return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref();
   }
-  template <size_t Idx, class ValueType>
-  KOKKOS_INLINE_FUNCTION ValueType volatile& get() volatile& noexcept {
-    return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref();
-  }
-  template <size_t Idx, class ValueType>
-  KOKKOS_INLINE_FUNCTION ValueType const volatile& get() const
-      volatile& noexcept {
-    return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref();
-  }
 };
 
 //==============================================================================
@@ -175,12 +162,6 @@ struct CombinedReducerStorageImpl {
     m_reducer.join(dest, src);
     return _fold_comma_emulation_return{};
   }
-
-  KOKKOS_INLINE_FUNCTION constexpr _fold_comma_emulation_return _join(
-      value_type volatile& dest, value_type const volatile& src) const {
-    m_reducer.join(dest, src);
-    return _fold_comma_emulation_return{};
-  }
 };
 
 // </editor-fold> end CombinedReducerStorage }}}1
@@ -193,28 +174,20 @@ struct _construct_combined_reducer_from_args_tag {};
 
 template <class T>
 KOKKOS_INLINE_FUNCTION auto _get_value_from_combined_reducer_ctor_arg(
-    T&& arg) noexcept ->
-    typename std::enable_if<
-        !is_view<typename std::decay<T>::type>::value &&
-            !is_reducer<typename std::decay<T>::type>::value,
-        typename std::decay<T>::type>::type {
+    T&& arg) noexcept
+    -> std::enable_if_t<!is_view<std::decay_t<T>>::value &&
+                            !is_reducer<std::decay_t<T>>::value,
+                        std::decay_t<T>> {
   return arg;
 }
 
 template <class T>
 KOKKOS_INLINE_FUNCTION auto _get_value_from_combined_reducer_ctor_arg(
-    T&& arg) noexcept ->
-    typename std::enable_if<is_view<typename std::decay<T>::type>::value,
-                            typename std::decay<T>::type>::type::value_type {
-  return arg();
-}
-
-template <class T>
-KOKKOS_INLINE_FUNCTION auto _get_value_from_combined_reducer_ctor_arg(
-    T&& arg) noexcept ->
-    typename std::enable_if<is_reducer<typename std::decay<T>::type>::value,
-                            typename std::decay<T>::type>::type::value_type {
-  return arg.reference();
+    T&&) noexcept ->
+    typename std::enable_if_t<is_view<std::decay_t<T>>::value ||
+                                  is_reducer<std::decay_t<T>>::value,
+                              std::decay_t<T>>::value_type {
+  return typename std::decay_t<T>::value_type{};
 }
 
 template <class IdxSeq, class Space, class...>
@@ -264,14 +237,6 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
             src.template get<Idxs, typename Reducers::value_type>())...);
   }
 
-  KOKKOS_FUNCTION void join(value_type volatile& dest,
-                            value_type const volatile& src) const noexcept {
-    emulate_fold_comma_operator(
-        this->CombinedReducerStorageImpl<Idxs, Reducers>::_join(
-            dest.template get<Idxs, typename Reducers::value_type>(),
-            src.template get<Idxs, typename Reducers::value_type>())...);
-  }
-
   KOKKOS_FUNCTION constexpr void init(value_type& dest) const noexcept {
     emulate_fold_comma_operator(
         this->CombinedReducerStorageImpl<Idxs, Reducers>::_init(
@@ -294,13 +259,26 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
     return m_value_view;
   }
 
-  KOKKOS_FUNCTION
-  constexpr static void write_value_back_to_original_references(
-      value_type const& value,
+  template <class ExecutionSpace, int Idx, class View>
+  static void write_one_value_back(
+      const ExecutionSpace& exec_space, View const& view,
+      typename View::const_value_type& value) noexcept {
+    if (Kokkos::SpaceAccessibility<typename View::memory_space,
+                                   Space>::assignable)
+      view() = value;
+    else
+      Kokkos::deep_copy(exec_space, view, value);
+  }
+
+  template <class ExecutionSpace>
+  static void write_value_back_to_original_references(
+      const ExecutionSpace& exec_space, value_type const& value,
       Reducers const&... reducers_that_reference_original_values) noexcept {
     emulate_fold_comma_operator(
-        (reducers_that_reference_original_values.view()() =
-             value.template get<Idxs, typename Reducers::value_type>())...);
+        (write_one_value_back<ExecutionSpace, Idxs>(
+             exec_space, reducers_that_reference_original_values.view(),
+             value.template get<Idxs, typename Reducers::value_type>()),
+         0)...);
   }
 };
 
@@ -441,9 +419,8 @@ struct CombinedReductionFunctorWrapper
 // <editor-fold desc="_make_reducer_from_arg"> {{{2
 
 template <class Space, class Reducer>
-KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if<
-    Kokkos::is_reducer<typename std::decay<Reducer>::type>::value,
-    typename std::decay<Reducer>::type>::type
+KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<
+    Kokkos::is_reducer<std::decay_t<Reducer>>::value, std::decay_t<Reducer>>
 _make_reducer_from_arg(Reducer&& arg_reducer) noexcept {
   return arg_reducer;
 }
@@ -456,21 +433,20 @@ struct _wrap_with_kokkos_sum {
 };
 
 template <class Space, class T>
-struct _wrap_with_kokkos_sum<
-    Space, T, typename std::enable_if<Kokkos::is_view<T>::value>::type> {
-  using type = Kokkos::Sum<typename T::value_type, Space>;
+struct _wrap_with_kokkos_sum<Space, T,
+                             std::enable_if_t<Kokkos::is_view<T>::value>> {
+  using type = Kokkos::Sum<typename T::value_type, typename T::memory_space>;
 };
 
 // TODO better error message for the case when a const& to a scalar is passed in
 //      (this is needed in general, though)
 template <class Space, class T>
-KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if<
-    !Kokkos::is_reducer<typename std::decay<T>::type>::value,
-    _wrap_with_kokkos_sum<Space, typename std::decay<T>::type>>::type::type
+KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if_t<
+    !Kokkos::is_reducer<std::decay_t<T>>::value,
+    _wrap_with_kokkos_sum<Space, std::decay_t<T>>>::type
 _make_reducer_from_arg(T&& arg_scalar) noexcept {
   return
-      typename _wrap_with_kokkos_sum<Space, typename std::decay<T>::type>::type{
-          arg_scalar};
+      typename _wrap_with_kokkos_sum<Space, std::decay_t<T>>::type{arg_scalar};
 }
 
 // This can't be an alias template because GCC doesn't know how to mangle
@@ -533,6 +509,8 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_wrapped_combined_functor(
   //----------------------------------------
 }
 
+template <typename FunctorType>
+using functor_has_value_t = typename FunctorType::value_type;
 }  // end namespace Impl
 
 //==============================================================================
@@ -546,9 +524,8 @@ template <class PolicyType, class Functor, class ReturnType1, class ReturnType2,
 auto parallel_reduce(std::string const& label, PolicyType const& policy,
                      Functor const& functor, ReturnType1&& returnType1,
                      ReturnType2&& returnType2,
-                     ReturnTypes&&... returnTypes) noexcept ->
-    typename std::enable_if<
-        Kokkos::is_execution_policy<PolicyType>::value>::type {
+                     ReturnTypes&&... returnTypes) noexcept
+    -> std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value> {
   //----------------------------------------
   // Since we don't support asynchronous combined reducers yet for various
   // reasons, we actually just want to work with the pointers and references
@@ -570,7 +547,7 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
 
   using combined_functor_type = decltype(combined_functor);
   static_assert(
-      Impl::FunctorDeclaresValueType<combined_functor_type, void>::value,
+      is_detected<Impl::functor_has_value_t, combined_functor_type>::value,
       "value_type not properly detected");
   using reduce_adaptor_t =
       Impl::ParallelReduceAdaptor<PolicyType, combined_functor_type,
@@ -584,9 +561,12 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
           "Kokkos::parallel_reduce: fence due to result being value, not view",
           combined_reducer);
   combined_reducer.write_value_back_to_original_references(
-      value, Impl::_make_reducer_from_arg<space_type>(returnType1),
+      policy.space(), value,
+      Impl::_make_reducer_from_arg<space_type>(returnType1),
       Impl::_make_reducer_from_arg<space_type>(returnType2),
       Impl::_make_reducer_from_arg<space_type>(returnTypes)...);
+  policy.space().fence(
+      "Kokkos::parallel_reduce: fence after copying values back");
   //----------------------------------------
 }
 
@@ -594,9 +574,8 @@ template <class PolicyType, class Functor, class ReturnType1, class ReturnType2,
           class... ReturnTypes>
 auto parallel_reduce(PolicyType const& policy, Functor const& functor,
                      ReturnType1&& returnType1, ReturnType2&& returnType2,
-                     ReturnTypes&&... returnTypes) noexcept ->
-    typename std::enable_if<
-        Kokkos::is_execution_policy<PolicyType>::value>::type {
+                     ReturnTypes&&... returnTypes) noexcept
+    -> std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value> {
   //----------------------------------------
   Kokkos::parallel_reduce("", policy, functor,
                           std::forward<ReturnType1>(returnType1),
diff --git a/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.cpp b/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.cpp
index edaae0fd4..ca56352f4 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.cpp
@@ -41,24 +41,28 @@
 // ************************************************************************
 //@HEADER
 */
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
+#include <impl/Kokkos_Command_Line_Parsing.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#include <cstring>
 #include <iostream>
+#include <regex>
 #include <string>
 #include <sstream>
-#include <cstring>
-#include <impl/Kokkos_Command_Line_Parsing.hpp>
-/** Duplicates of Kokkos_Error.cpp/hpp, reproduced here
- * for use in non-Kokkos applications
- */
+
 namespace {
-void traceback_callstack(std::ostream& msg) {
-  msg << std::endl << "Traceback functionality not available" << std::endl;
-}
-void throw_runtime_exception(const std::string& msg) {
-  std::ostringstream o;
-  o << msg;
-  traceback_callstack(o);
-  throw std::runtime_error(o.str());
-}
+
+auto const regex_true = std::regex(
+    "(yes|true|1)", std::regex_constants::icase | std::regex_constants::egrep);
+
+auto const regex_false = std::regex(
+    "(no|false|0)", std::regex_constants::icase | std::regex_constants::egrep);
+
 }  // namespace
 
 bool Kokkos::Impl::is_unsigned_int(const char* str) {
@@ -85,49 +89,204 @@ bool Kokkos::Impl::check_arg(char const* arg, char const* expected) {
   return true;
 }
 
-bool Kokkos::Impl::check_int_arg(char const* arg, char const* expected,
-                                 int* value) {
-  if (!check_arg(arg, expected)) return false;
-  std::size_t arg_len = std::strlen(arg);
-  std::size_t exp_len = std::strlen(expected);
-  bool okay           = true;
-  if (arg_len == exp_len || arg[exp_len] != '=') okay = false;
-  char const* number = arg + exp_len + 1;
-  if (!Kokkos::Impl::is_unsigned_int(number) || strlen(number) == 0)
-    okay = false;
-  *value = std::stoi(number);
-  if (!okay) {
-    std::ostringstream ss;
-    ss << "Error: expecting an '=INT' after command line argument '" << expected
-       << "'";
-    ss << ". Raised by Kokkos::initialize(int narg, char* argc[]).";
-    throw_runtime_exception(ss.str());
+bool Kokkos::Impl::check_env_bool(char const* name, bool& val) {
+  char const* var = std::getenv(name);
+
+  if (!var) {
+    return false;
+  }
+
+  if (std::regex_match(var, regex_true)) {
+    val = true;
+    return true;
+  }
+
+  if (!std::regex_match(var, regex_false)) {
+    std::stringstream ss;
+    ss << "Error: cannot convert environment variable '" << name << "=" << var
+       << "' to a boolean."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
   }
+
+  val = false;
   return true;
 }
-bool Kokkos::Impl::check_str_arg(char const* arg, char const* expected,
-                                 std::string& value) {
-  if (!check_arg(arg, expected)) return false;
-  std::size_t arg_len = std::strlen(arg);
-  std::size_t exp_len = std::strlen(expected);
-  bool okay           = true;
-  if (arg_len == exp_len || arg[exp_len] != '=') okay = false;
-  char const* remain = arg + exp_len + 1;
-  value              = remain;
-  if (!okay) {
-    std::ostringstream ss;
-    ss << "Error: expecting an '=STRING' after command line argument '"
-       << expected << "'";
-    ss << ". Raised by Kokkos::initialize(int narg, char* argc[]).";
-    throw_runtime_exception(ss.str());
+
+bool Kokkos::Impl::check_env_int(char const* name, int& val) {
+  char const* var = std::getenv(name);
+
+  if (!var) {
+    return false;
+  }
+
+  errno = 0;
+  char* var_end;
+  val = std::strtol(var, &var_end, 10);
+
+  if (var == var_end) {
+    std::stringstream ss;
+    ss << "Error: cannot convert environment variable '" << name << '=' << var
+       << "' to an integer."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+
+  if (errno == ERANGE) {
+    std::stringstream ss;
+    ss << "Error: converted value for environment variable '" << name << '='
+       << var << "' falls out of range."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+
+  return true;
+}
+
+bool Kokkos::Impl::check_arg_bool(char const* arg, char const* name,
+                                  bool& val) {
+  auto const len = std::strlen(name);
+  if (std::strncmp(arg, name, len) != 0) {
+    return false;
+  }
+  auto const arg_len = strlen(arg);
+  if (arg_len == len) {
+    val = true;  // --kokkos-foo without =BOOL interpreted as fool=true
+    return true;
+  }
+  if (arg_len <= len + 1 || arg[len] != '=') {
+    std::stringstream ss;
+    ss << "Error: command line argument '" << arg
+       << "' is not recognized as a valid boolean."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
   }
+
+  std::advance(arg, len + 1);
+  if (std::regex_match(arg, regex_true)) {
+    val = true;
+    return true;
+  }
+  if (!std::regex_match(arg, regex_false)) {
+    std::stringstream ss;
+    ss << "Error: cannot convert command line argument '" << name << "=" << arg
+       << "' to a boolean."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+  val = false;
+  return true;
+}
+
+bool Kokkos::Impl::check_arg_int(char const* arg, char const* name, int& val) {
+  auto const len = std::strlen(name);
+  if (std::strncmp(arg, name, len) != 0) {
+    return false;
+  }
+  auto const arg_len = strlen(arg);
+  if (arg_len <= len + 1 || arg[len] != '=') {
+    std::stringstream ss;
+    ss << "Error: command line argument '" << arg
+       << "' is not recognized as a valid integer."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+
+  std::advance(arg, len + 1);
+
+  errno = 0;
+  char* arg_end;
+  val = std::strtol(arg, &arg_end, 10);
+
+  if (arg == arg_end) {
+    std::stringstream ss;
+    ss << "Error: cannot convert command line argument '" << name << '=' << arg
+       << "' to an integer."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+
+  if (errno == ERANGE) {
+    std::stringstream ss;
+    ss << "Error: converted value for command line argument '" << name << '='
+       << arg << "' falls out of range."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+
+  return true;
+}
+
+bool Kokkos::Impl::check_arg_str(char const* arg, char const* name,
+                                 std::string& val) {
+  auto const len = std::strlen(name);
+  if (std::strncmp(arg, name, len) != 0) {
+    return false;
+  }
+  auto const arg_len = strlen(arg);
+  if (arg_len <= len + 1 || arg[len] != '=') {
+    std::stringstream ss;
+    ss << "Error: command line argument '" << arg
+       << "' is not recognized as a valid string."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+
+  std::advance(arg, len + 1);
+
+  val = arg;
   return true;
 }
-void Kokkos::Impl::warn_deprecated_command_line_argument(std::string deprecated,
-                                                         std::string valid) {
-  std::cerr
-      << "Warning: command line argument '" << deprecated
-      << "' is deprecated. Use '" << valid
-      << "' instead. Raised by Kokkos::initialize(int narg, char* argc[])."
-      << std::endl;
+
+void Kokkos::Impl::warn_deprecated_environment_variable(
+    std::string deprecated) {
+  std::cerr << "Warning: environment variable '" << deprecated
+            << "' is deprecated."
+            << " Raised by Kokkos::initialize()." << std::endl;
+}
+
+void Kokkos::Impl::warn_deprecated_environment_variable(
+    std::string deprecated, std::string use_instead) {
+  std::cerr << "Warning: environment variable '" << deprecated
+            << "' is deprecated."
+            << " Use '" << use_instead << "' instead."
+            << " Raised by Kokkos::initialize()." << std::endl;
+}
+
+void Kokkos::Impl::warn_deprecated_command_line_argument(
+    std::string deprecated) {
+  std::cerr << "Warning: command line argument '" << deprecated
+            << "' is deprecated."
+            << " Raised by Kokkos::initialize()." << std::endl;
+}
+
+void Kokkos::Impl::warn_deprecated_command_line_argument(
+    std::string deprecated, std::string use_instead) {
+  std::cerr << "Warning: command line argument '" << deprecated
+            << "' is deprecated."
+            << " Use '" << use_instead << "' instead."
+            << " Raised by Kokkos::initialize()." << std::endl;
+}
+
+namespace {
+std::vector<std::regex> do_not_warn_regular_expressions{
+    std::regex{"--kokkos-tool.*", std::regex::egrep},
+};
+}
+
+void Kokkos::Impl::do_not_warn_not_recognized_command_line_argument(
+    std::regex ignore) {
+  do_not_warn_regular_expressions.push_back(std::move(ignore));
+}
+
+void Kokkos::Impl::warn_not_recognized_command_line_argument(
+    std::string not_recognized) {
+  for (auto const& ignore : do_not_warn_regular_expressions) {
+    if (std::regex_match(not_recognized, ignore)) {
+      return;
+    }
+  }
+  std::cerr << "Warning: command line argument '" << not_recognized
+            << "' is not recognized."
+            << " Raised by Kokkos::initialize()." << std::endl;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.hpp b/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.hpp
index 7e1d3049e..b22bc3e34 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Command_Line_Parsing.hpp
@@ -46,17 +46,25 @@
 #define KOKKOS_COMMAND_LINE_PARSING_HPP
 
 #include <string>
-#include <iosfwd>
+#include <regex>
 
 namespace Kokkos {
 namespace Impl {
 bool is_unsigned_int(const char* str);
 bool check_arg(char const* arg, char const* expected);
-// void throw_runtime_exception(const std::string& msg);
-bool check_int_arg(char const* arg, char const* expected, int* value);
-bool check_str_arg(char const* arg, char const* expected, std::string& value);
+bool check_arg_bool(char const* arg, char const* name, bool& val);
+bool check_arg_int(char const* arg, char const* name, int& val);
+bool check_arg_str(char const* arg, char const* name, std::string& val);
+bool check_env_bool(char const* name, bool& val);
+bool check_env_int(char const* name, int& val);
+void warn_deprecated_environment_variable(std::string deprecated);
+void warn_deprecated_environment_variable(std::string deprecated,
+                                          std::string use_instead);
+void warn_deprecated_command_line_argument(std::string deprecated);
 void warn_deprecated_command_line_argument(std::string deprecated,
-                                           std::string valid);
+                                           std::string use_instead);
+void warn_not_recognized_command_line_argument(std::string not_recognized);
+void do_not_warn_not_recognized_command_line_argument(std::regex ignore);
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
index 0a3b649fe..f624e7a14 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -42,10 +42,18 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_Command_Line_Parsing.hpp>
+#include <impl/Kokkos_ParseCommandLineArgumentsAndEnvironmentVariables.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>
+#include <impl/Kokkos_ExecSpaceManager.hpp>
+
+#include <algorithm>
 #include <cctype>
 #include <cstring>
 #include <iostream>
@@ -55,6 +63,7 @@
 #include <functional>
 #include <list>
 #include <cerrno>
+#include <random>
 #include <regex>
 #ifndef _WIN32
 #include <unistd.h>
@@ -65,6 +74,7 @@
 //----------------------------------------------------------------------------
 namespace {
 bool g_is_initialized = false;
+bool g_is_finalized   = false;
 bool g_show_warnings  = true;
 bool g_tune_internals = false;
 // When compiling with clang/LLVM and using the GNU (GCC) C++ Standard Library
@@ -78,10 +88,7 @@ bool g_tune_internals = false;
 // segmented array.
 using hook_function_type = std::function<void()>;
 std::stack<hook_function_type, std::list<hook_function_type>> finalize_hooks;
-}  // namespace
 
-namespace Kokkos {
-namespace Impl {
 /**
  * The category is only used in printing, tools
  * get all metadata free of category
@@ -100,50 +107,129 @@ void declare_configuration_metadata(const std::string& category,
   metadata_map[category][key] = value;
 }
 
-ExecSpaceManager& ExecSpaceManager::get_instance() {
+void combine(Kokkos::InitializationSettings& out,
+             Kokkos::InitializationSettings const& in) {
+#define KOKKOS_IMPL_COMBINE_SETTING(NAME) \
+  if (in.has_##NAME()) {                  \
+    out.set_##NAME(in.get_##NAME());      \
+  }                                       \
+  static_assert(true, "no-op to require trailing semicolon")
+  KOKKOS_IMPL_COMBINE_SETTING(num_threads);
+  KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by);
+  KOKKOS_IMPL_COMBINE_SETTING(device_id);
+  KOKKOS_IMPL_COMBINE_SETTING(num_devices);
+  KOKKOS_IMPL_COMBINE_SETTING(skip_device);
+  KOKKOS_IMPL_COMBINE_SETTING(disable_warnings);
+  KOKKOS_IMPL_COMBINE_SETTING(tune_internals);
+  KOKKOS_IMPL_COMBINE_SETTING(tools_help);
+  KOKKOS_IMPL_COMBINE_SETTING(tools_libs);
+  KOKKOS_IMPL_COMBINE_SETTING(tools_args);
+#undef KOKKOS_IMPL_COMBINE_SETTING
+}
+
+void combine(Kokkos::InitializationSettings& out,
+             Kokkos::Tools::InitArguments const& in) {
+  using Kokkos::Tools::InitArguments;
+  if (in.help != InitArguments::PossiblyUnsetOption::unset) {
+    out.set_tools_help(in.help == InitArguments::PossiblyUnsetOption::on);
+  }
+  if (in.lib != InitArguments::unset_string_option) {
+    out.set_tools_libs(in.lib);
+  }
+  if (in.args != InitArguments::unset_string_option) {
+    out.set_tools_args(in.args);
+  }
+}
+
+void combine(Kokkos::Tools::InitArguments& out,
+             Kokkos::InitializationSettings const& in) {
+  using Kokkos::Tools::InitArguments;
+  if (in.has_tools_help()) {
+    out.help = in.get_tools_help() ? InitArguments::PossiblyUnsetOption::on
+                                   : InitArguments::PossiblyUnsetOption::off;
+  }
+  if (in.has_tools_libs()) {
+    out.lib = in.get_tools_libs();
+  }
+  if (in.has_tools_args()) {
+    out.args = in.get_tools_args();
+  }
+}
+
+int get_device_count() {
+#if defined(KOKKOS_ENABLE_CUDA)
+  return Kokkos::Cuda::detect_device_count();
+#elif defined(KOKKOS_ENABLE_HIP)
+  return Kokkos::Experimental::HIP::detect_device_count();
+#elif defined(KOKKOS_ENABLE_SYCL)
+  return sycl::device::get_devices(sycl::info::device_type::gpu).size();
+#elif defined(KOKKOS_ENABLE_OPENACC)
+  return acc_get_num_devices(
+      Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
+#else
+  Kokkos::abort("implementation bug");
+  return -1;
+#endif
+}
+
+unsigned get_process_id() {
+#ifdef _WIN32
+  return unsigned(GetCurrentProcessId());
+#else
+  return unsigned(getpid());
+#endif
+}
+
+bool is_valid_num_threads(int x) { return x > 0; }
+
+bool is_valid_device_id(int x) { return x >= 0; }
+
+bool is_valid_map_device_id_by(std::string const& x) {
+  return x == "mpi_rank" || x == "random";
+}
+
+}  // namespace
+
+Kokkos::Impl::ExecSpaceManager& Kokkos::Impl::ExecSpaceManager::get_instance() {
   static ExecSpaceManager space_initializer = {};
   return space_initializer;
 }
 
-void ExecSpaceManager::register_space_factory(
-    const std::string name, std::unique_ptr<ExecSpaceInitializerBase> space) {
+void Kokkos::Impl::ExecSpaceManager::register_space_factory(
+    const std::string name, std::unique_ptr<ExecSpaceBase> space) {
   exec_space_factory_list[name] = std::move(space);
 }
 
-void ExecSpaceManager::initialize_spaces(const Kokkos::InitArguments& args) {
+void Kokkos::Impl::ExecSpaceManager::initialize_spaces(
+    const InitializationSettings& settings) {
   // Note: the names of the execution spaces, used as keys in the map, encode
-  // the ordering of the initialization code from the old initializtion stuff.
+  // the ordering of the initialization code from the old initialization stuff.
   // Eventually, we may want to do something less brittle than this, but for now
   // we're just preserving compatibility with the old implementation.
   for (auto& to_init : exec_space_factory_list) {
-    to_init.second->initialize(args);
+    to_init.second->initialize(settings);
   }
 }
 
-void ExecSpaceManager::finalize_spaces(const bool all_spaces) {
+void Kokkos::Impl::ExecSpaceManager::finalize_spaces() {
   for (auto& to_finalize : exec_space_factory_list) {
-    to_finalize.second->finalize(all_spaces);
+    to_finalize.second->finalize();
   }
 }
 
-void ExecSpaceManager::static_fence() {
-  for (auto& to_fence : exec_space_factory_list) {
-    to_fence.second->fence();
-  }
-}
-void ExecSpaceManager::static_fence(const std::string& name) {
+void Kokkos::Impl::ExecSpaceManager::static_fence(const std::string& name) {
   for (auto& to_fence : exec_space_factory_list) {
-    to_fence.second->fence(name);
+    to_fence.second->static_fence(name);
   }
 }
-void ExecSpaceManager::print_configuration(std::ostream& msg,
-                                           const bool detail) {
-  for (auto& to_print : exec_space_factory_list) {
-    to_print.second->print_configuration(msg, detail);
+void Kokkos::Impl::ExecSpaceManager::print_configuration(std::ostream& os,
+                                                         bool verbose) {
+  for (auto const& to_print : exec_space_factory_list) {
+    to_print.second->print_configuration(os, verbose);
   }
 }
 
-int get_ctest_gpu(const char* local_rank_str) {
+int Kokkos::Impl::get_ctest_gpu(const char* local_rank_str) {
   auto const* ctest_kokkos_device_type =
       std::getenv("CTEST_KOKKOS_DEVICE_TYPE");
   if (!ctest_kokkos_device_type) {
@@ -232,61 +318,142 @@ int get_ctest_gpu(const char* local_rank_str) {
   return std::stoi(id.c_str());
 }
 
-// function to extract gpu # from args
-int get_gpu(const InitArguments& args) {
-  int use_gpu        = args.device_id;
-  const int ndevices = [](int num_devices) -> int {
-    if (num_devices > 0) return num_devices;
-#if defined(KOKKOS_ENABLE_CUDA)
-    return Cuda::detect_device_count();
-#elif defined(KOKKOS_ENABLE_HIP)
-    return Experimental::HIP::detect_device_count();
-#elif defined(KOKKOS_ENABLE_SYCL)
-    return sycl::device::get_devices(sycl::info::device_type::gpu).size();
-#else
-    return num_devices;
-#endif
-  }(args.ndevices);
-  const int skip_device = args.skip_device;
-
-  // if the exact device is not set, but ndevices was given, assign round-robin
-  // using on-node MPI rank
-  if (use_gpu < 0) {
-    auto const* local_rank_str =
-        std::getenv("OMPI_COMM_WORLD_LOCAL_RANK");  // OpenMPI
-    if (!local_rank_str)
-      local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK");  // MVAPICH2
-    if (!local_rank_str)
-      local_rank_str = std::getenv("SLURM_LOCALID");  // SLURM
-
-    auto const* ctest_kokkos_device_type =
-        std::getenv("CTEST_KOKKOS_DEVICE_TYPE");  // CTest
-    auto const* ctest_resource_group_count_str =
-        std::getenv("CTEST_RESOURCE_GROUP_COUNT");  // CTest
-    if (ctest_kokkos_device_type && ctest_resource_group_count_str &&
-        local_rank_str) {
-      // Use the device assigned by CTest
-      use_gpu = get_ctest_gpu(local_rank_str);
-    } else if (ndevices > 0) {
-      // Use the device assigned by the rank
-      if (local_rank_str) {
-        auto local_rank = std::stoi(local_rank_str);
-        use_gpu         = local_rank % ndevices;
-      } else {
-        // user only gave use ndevices, but the MPI environment variable wasn't
-        // set. start with GPU 0 at this point
-        use_gpu = 0;
+std::vector<int> Kokkos::Impl::get_visible_devices(
+    Kokkos::InitializationSettings const& settings, int device_count) {
+  std::vector<int> visible_devices;
+  char* env_visible_devices = std::getenv("KOKKOS_VISIBLE_DEVICES");
+  if (env_visible_devices) {
+    std::stringstream ss(env_visible_devices);
+    for (int i; ss >> i;) {
+      visible_devices.push_back(i);
+      if (ss.peek() == ',') ss.ignore();
+    }
+    for (auto id : visible_devices) {
+      if (id < 0) {
+        ss << "Error: Invalid device id '" << id
+           << "' in environment variable 'KOKKOS_VISIBLE_DEVICES="
+           << env_visible_devices << "'."
+           << " Device id cannot be negative!"
+           << " Raised by Kokkos::initialize().\n";
+      }
+      if (id >= device_count) {
+        ss << "Error: Invalid device id '" << id
+           << "' in environment variable 'KOKKOS_VISIBLE_DEVICES="
+           << env_visible_devices << "'."
+           << " Device id must be smaller than the number of GPUs available"
+           << " for execution '" << device_count << "'!"
+           << " Raised by Kokkos::initialize().\n";
+      }
+    }
+  } else {
+    int num_devices =
+        settings.has_num_devices() ? settings.get_num_devices() : device_count;
+    if (num_devices > device_count) {
+      std::stringstream ss;
+      ss << "Error: Specified number of devices '" << num_devices
+         << "' exceeds the actual number of GPUs available for execution '"
+         << device_count << "'."
+         << " Raised by Kokkos::initialize().\n";
+      Kokkos::abort(ss.str().c_str());
+    }
+    for (int i = 0; i < num_devices; ++i) {
+      visible_devices.push_back(i);
+    }
+    if (settings.has_skip_device()) {
+      if (visible_devices.size() == 1 && settings.get_skip_device() == 0) {
+        Kokkos::abort(
+            "Error: skipping the only GPU available for execution.\n"
+            " Raised by Kokkos::initialize().\n");
       }
+      visible_devices.erase(
+          std::remove(visible_devices.begin(), visible_devices.end(),
+                      settings.get_skip_device()),
+          visible_devices.end());
+    }
+  }
+  if (visible_devices.empty()) {
+    Kokkos::abort(
+        "Error: no GPU available for execution.\n"
+        " Raised by Kokkos::initialize().\n");
+  }
+  return visible_devices;
+}
+
+int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
+  std::vector<int> visible_devices =
+      get_visible_devices(settings, get_device_count());
+  int const num_devices = visible_devices.size();
+  // device_id is provided
+  if (settings.has_device_id()) {
+    int const id = settings.get_device_id();
+    if (id < 0) {
+      std::stringstream ss;
+      ss << "Error: Requested GPU with invalid id '" << id << "'."
+         << " Device id cannot be negative!"
+         << " Raised by Kokkos::initialize().\n";
+      Kokkos::abort(ss.str().c_str());
+    }
+    if (id >= num_devices) {
+      std::stringstream ss;
+      ss << "Error: Requested GPU with id '" << id << "' but only "
+         << num_devices << "GPU(s) available!"
+         << " Raised by Kokkos::initialize().\n";
+      Kokkos::abort(ss.str().c_str());
     }
-    // shift assignments over by one so no one is assigned to "skip_device"
-    if (use_gpu >= skip_device) ++use_gpu;
+    return visible_devices[settings.get_device_id()];
   }
-  return use_gpu;
+
+  // either random or round-robin assignment based on local MPI rank
+  if (settings.has_map_device_id_by() &&
+      !is_valid_map_device_id_by(settings.get_map_device_id_by())) {
+    std::stringstream ss;
+    ss << "Error: map_device_id_by setting '" << settings.get_map_device_id_by()
+       << "' is not recognized."
+       << " Raised by Kokkos::initialize().\n";
+    Kokkos::abort(ss.str().c_str());
+  }
+
+  if (settings.has_map_device_id_by() &&
+      settings.get_map_device_id_by() == "random") {
+    std::default_random_engine gen(get_process_id());
+    std::uniform_int_distribution<int> distribution(0, num_devices - 1);
+    return visible_devices[distribution(gen)];
+  }
+
+  // either map_device_id_by is not specified or it is mpi_rank
+  if (settings.has_map_device_id_by() &&
+      settings.get_map_device_id_by() != "mpi_rank") {
+    Kokkos::abort("implementation bug");
+  }
+
+  auto const* local_rank_str =
+      std::getenv("OMPI_COMM_WORLD_LOCAL_RANK");  // OpenMPI
+  if (!local_rank_str)
+    local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK");  // MVAPICH2
+  if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID");  // SLURM
+
+  // use first GPU available for execution if unable to detect local MPI rank
+  if (!local_rank_str) {
+    if (settings.has_map_device_id_by()) {
+      std::cerr << "Warning: unable to detect local MPI rank."
+                << " Falling back to the first GPU available for execution."
+                << " Raised by Kokkos::initialize()." << std::endl;
+    }
+    return visible_devices[0];
+  }
+
+  // use device assigned by CTest when resource allocation is activated
+  if (std::getenv("CTEST_KOKKOS_DEVICE_TYPE") &&
+      std::getenv("CTEST_RESOURCE_GROUP_COUNT")) {
+    return get_ctest_gpu(local_rank_str);
+  }
+
+  return visible_devices[std::stoi(local_rank_str) % visible_devices.size()];
 }
 
 namespace {
 
-void initialize_backends(const InitArguments& args) {
+void initialize_backends(const Kokkos::InitializationSettings& settings) {
 // This is an experimental setting
 // For KNL in Flat mode this variable should be set, so that
 // memkind allocates high bandwidth memory correctly.
@@ -294,10 +461,10 @@ void initialize_backends(const InitArguments& args) {
   setenv("MEMKIND_HBW_NODES", "1", 0);
 #endif
 
-  Impl::ExecSpaceManager::get_instance().initialize_spaces(args);
+  Kokkos::Impl::ExecSpaceManager::get_instance().initialize_spaces(settings);
 }
 
-void initialize_profiling(const Tools::InitArguments& args) {
+void initialize_profiling(const Kokkos::Tools::InitArguments& args) {
   auto initialization_status =
       Kokkos::Tools::Impl::initialize_tools_subsystem(args);
   if (initialization_status.result ==
@@ -310,7 +477,7 @@ void initialize_profiling(const Tools::InitArguments& args) {
              Kokkos::Tools::Impl::InitializationStatus::InitializationResult::
                  success) {
     Kokkos::Tools::parseArgs(args.args);
-    for (const auto& category_value : Kokkos::Impl::metadata_map) {
+    for (const auto& category_value : metadata_map) {
       for (const auto& key_value : category_value.second) {
         Kokkos::Tools::declareMetadata(key_value.first, key_value.second);
       }
@@ -329,9 +496,12 @@ std::string version_string_from_int(int version_number) {
               << "." << version_number % 100;
   return str_builder.str();
 }
-void pre_initialize_internal(const InitArguments& args) {
-  if (args.disable_warnings) g_show_warnings = false;
-  if (args.tune_internals) g_tune_internals = true;
+
+void pre_initialize_internal(const Kokkos::InitializationSettings& settings) {
+  if (settings.has_disable_warnings() && settings.get_disable_warnings())
+    g_show_warnings = false;
+  if (settings.has_tune_internals() && settings.get_tune_internals())
+    g_tune_internals = true;
   declare_configuration_metadata("version_info", "Kokkos Version",
                                  version_string_from_int(KOKKOS_VERSION));
 #ifdef KOKKOS_COMPILER_APPLECC
@@ -379,26 +549,6 @@ void pre_initialize_internal(const InitArguments& args) {
                                  std::to_string(KOKKOS_COMPILER_MSVC));
   declare_configuration_metadata("tools_only", "compiler_family", "msvc");
 #endif
-#ifdef KOKKOS_ENABLE_ISA_KNC
-  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_KNC",
-                                 "yes");
-#else
-  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_KNC", "no");
-#endif
-#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
-  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_POWERPCLE",
-                                 "yes");
-#else
-  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_POWERPCLE",
-                                 "no");
-#endif
-#ifdef KOKKOS_ENABLE_ISA_X86_64
-  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_X86_64",
-                                 "yes");
-#else
-  declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_X86_64",
-                                 "no");
-#endif
 
 #ifdef KOKKOS_ENABLE_GNU_ATOMICS
   declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "yes");
@@ -468,13 +618,6 @@ void pre_initialize_internal(const InitArguments& args) {
   declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC",
                                  "no");
 #endif
-#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
-  declare_configuration_metadata("memory", "KOKKOS_ENABLE_POSIX_MEMALIGN",
-                                 "yes");
-#else
-  declare_configuration_metadata("memory", "KOKKOS_ENABLE_POSIX_MEMALIGN",
-                                 "no");
-#endif
 
 #ifdef KOKKOS_ENABLE_ASM
   declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes");
@@ -512,23 +655,34 @@ void pre_initialize_internal(const InitArguments& args) {
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes");
 #else
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no");
+#endif
+#ifdef KOKKOS_ENABLE_LIBDL
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "yes");
+#else
+  declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "no");
 #endif
   declare_configuration_metadata("architecture", "Default Device",
                                  typeid(Kokkos::DefaultExecutionSpace).name());
 }
 
-void post_initialize_internal(const InitArguments& args) {
-  initialize_profiling(args.impl_get_tools_init_arguments());
+void post_initialize_internal(const Kokkos::InitializationSettings& settings) {
+  Kokkos::Tools::InitArguments tools_init_arguments;
+  combine(tools_init_arguments, settings);
+  initialize_profiling(tools_init_arguments);
   g_is_initialized = true;
+  if (settings.has_print_configuration() &&
+      settings.get_print_configuration()) {
+    ::Kokkos::print_configuration(std::cout);
+  }
 }
 
-void initialize_internal(const InitArguments& args) {
-  pre_initialize_internal(args);
-  initialize_backends(args);
-  post_initialize_internal(args);
+void initialize_internal(const Kokkos::InitializationSettings& settings) {
+  pre_initialize_internal(settings);
+  initialize_backends(settings);
+  post_initialize_internal(settings);
 }
 
-void finalize_internal(const bool all_spaces = false) {
+void finalize_internal() {
   typename decltype(finalize_hooks)::size_type numSuccessfulCalls = 0;
   while (!finalize_hooks.empty()) {
     auto f = finalize_hooks.top();
@@ -558,124 +712,168 @@ void finalize_internal(const bool all_spaces = false) {
 
   Kokkos::Profiling::finalize();
 
-  Impl::ExecSpaceManager::get_instance().finalize_spaces(all_spaces);
+  Kokkos::Impl::ExecSpaceManager::get_instance().finalize_spaces();
 
   g_is_initialized = false;
+  g_is_finalized   = true;
   g_show_warnings  = true;
   g_tune_internals = false;
 }
 
 void fence_internal(const std::string& name) {
-  Impl::ExecSpaceManager::get_instance().static_fence(name);
+  Kokkos::Impl::ExecSpaceManager::get_instance().static_fence(name);
 }
 
-unsigned get_process_id() {
-#ifdef _WIN32
-  return unsigned(GetCurrentProcessId());
-#else
-  return unsigned(getpid());
-#endif
+void print_help_message() {
+  auto const help_message = R"(
+--------------------------------------------------------------------------------
+-------------Kokkos command line arguments--------------------------------------
+--------------------------------------------------------------------------------
+This program is using Kokkos.  You can use the following command line flags to
+control its behavior:
+
+Kokkos Core Options:
+  --kokkos-help                  : print this message
+  --kokkos-disable-warnings      : disable kokkos warning messages
+  --kokkos-print-configuration   : print configuration
+  --kokkos-tune-internals        : allow Kokkos to autotune policies and declare
+                                   tuning features through the tuning system. If
+                                   left off, Kokkos uses heuristics
+  --kokkos-num-threads=INT       : specify total number of threads to use for
+                                   parallel regions on the host.
+  --kokkos-device-id=INT         : specify device id to be used by Kokkos.
+  --kokkos-map-device-id-by=(random|mpi_rank)
+                                 : strategy to select device-id automatically from
+                                   available devices.
+                                   - random:   choose a random device from available.
+                                   - mpi_rank: choose device-id based on a round robin
+                                               assignment of local MPI ranks.
+                                               Works with OpenMPI, MVAPICH, SLURM, and
+                                               derived implementations.
+
+Kokkos Tools Options:
+  --kokkos-tools-libs=STR        : Specify which of the tools to use. Must either
+                                   be full path to library or name of library if the
+                                   path is present in the runtime library search path
+                                   (e.g. LD_LIBRARY_PATH)
+  --kokkos-tools-help            : Query the (loaded) kokkos-tool for its command-line
+                                   option support (which should then be passed via
+                                   --kokkos-tools-args="...")
+  --kokkos-tools-args=STR        : A single (quoted) string of options which will be
+                                   whitespace delimited and passed to the loaded
+                                   kokkos-tool as command-line arguments. E.g.
+                                   `<EXE> --kokkos-tools-args="-c input.txt"` will
+                                   pass `<EXE> -c input.txt` as argc/argv to tool
+
+Except for --kokkos[-tools]-help, you can alternatively set the corresponding
+environment variable of a flag (all letters in upper-case and underscores
+instead of hyphens). For example, to disable warning messages, you can either
+specify --kokkos-disable-warnings or set the KOKKOS_DISABLE_WARNINGS
+environment variable to yes.
+
+Join us on Slack, visit https://kokkosteam.slack.com
+Report bugs to https://github.com/kokkos/kokkos/issues
+--------------------------------------------------------------------------------
+)";
+  std::cout << help_message << std::endl;
 }
 
-void parse_command_line_arguments(int& narg, char* arg[],
-                                  InitArguments& arguments) {
-  auto& num_threads      = arguments.num_threads;
-  auto& numa             = arguments.num_numa;
-  auto& device           = arguments.device_id;
-  auto& ndevices         = arguments.ndevices;
-  auto& skip_device      = arguments.skip_device;
-  auto& disable_warnings = arguments.disable_warnings;
-  auto& tune_internals   = arguments.tune_internals;
-  auto& tool_help        = arguments.tool_help;
-  auto& tool_args        = arguments.tool_args;
-  auto& tool_lib         = arguments.tool_lib;
-
-  bool kokkos_threads_found  = false;
-  bool kokkos_numa_found     = false;
-  bool kokkos_device_found   = false;
-  bool kokkos_ndevices_found = false;
-  auto tools_init_arguments  = arguments.impl_get_tools_init_arguments();
-  Tools::Impl::parse_command_line_arguments(narg, arg, tools_init_arguments);
-  if (tools_init_arguments.tune_internals !=
-      Kokkos::Tools::InitArguments::PossiblyUnsetOption::unset) {
-    tune_internals = (tools_init_arguments.tune_internals ==
-                      Kokkos::Tools::InitArguments::PossiblyUnsetOption::on);
-  }
-  if (tools_init_arguments.help !=
-      Kokkos::Tools::InitArguments::PossiblyUnsetOption::unset) {
-    tool_help = (tools_init_arguments.help ==
-                 Kokkos::Tools::InitArguments::PossiblyUnsetOption::on);
-  }
-  if (tools_init_arguments.lib !=
-      Kokkos::Tools::InitArguments::unset_string_option) {
-    tool_lib = tools_init_arguments.lib;
-  }
-  if (tools_init_arguments.args !=
-      Kokkos::Tools::InitArguments::unset_string_option) {
-    tool_args = tools_init_arguments.args;
-  }
+}  // namespace
 
-  int iarg = 0;
+void Kokkos::Impl::parse_command_line_arguments(
+    int& argc, char* argv[], InitializationSettings& settings) {
+  Tools::InitArguments tools_init_arguments;
+  combine(tools_init_arguments, settings);
+  Tools::Impl::parse_command_line_arguments(argc, argv, tools_init_arguments);
+  combine(settings, tools_init_arguments);
+
+  int num_threads;
+  int device_id;
+  int num_devices;  // deprecated
+  int skip_device;  // deprecated
+  std::string map_device_id_by;
+  bool disable_warnings;
+  bool print_configuration;
+  bool tune_internals;
 
-  while (iarg < narg) {
-    if (check_int_arg(arg[iarg], "--kokkos-threads", &num_threads)) {
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
+  auto get_flag = [](std::string s) -> std::string {
+    return s.erase(s.find('='));
+  };
+
+  bool help_flag = false;
+
+  int iarg = 0;
+  while (iarg < argc) {
+    bool remove_flag = false;
+
+    if (check_arg(argv[iarg], "--kokkos-numa") ||
+        check_arg(argv[iarg], "--numa")) {
+      warn_deprecated_command_line_argument(get_flag(argv[iarg]));
+      // remove flag if prefixed with '--kokkos-'
+      remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0;
+    } else if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads) ||
+               check_arg_int(argv[iarg], "--num-threads", num_threads) ||
+               check_arg_int(argv[iarg], "--kokkos-threads", num_threads) ||
+               check_arg_int(argv[iarg], "--threads", num_threads)) {
+      if (get_flag(argv[iarg]) != "--kokkos-num-threads") {
+        warn_deprecated_command_line_argument(get_flag(argv[iarg]),
+                                              "--kokkos-num-threads");
       }
-      kokkos_threads_found = true;
-      narg--;
-    } else if (!kokkos_threads_found &&
-               check_int_arg(arg[iarg], "--threads", &num_threads)) {
-      iarg++;
-    } else if (check_int_arg(arg[iarg], "--kokkos-numa", &numa)) {
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
+      if (!is_valid_num_threads(num_threads)) {
+        std::stringstream ss;
+        ss << "Error: command line argument '" << argv[iarg] << "' is invalid."
+           << " The number of threads must be greater than or equal to one."
+           << " Raised by Kokkos::initialize().\n";
+        Kokkos::abort(ss.str().c_str());
       }
-      kokkos_numa_found = true;
-      narg--;
-    } else if (!kokkos_numa_found &&
-               check_int_arg(arg[iarg], "--numa", &numa)) {
-      iarg++;
-    } else if (check_int_arg(arg[iarg], "--kokkos-device-id", &device) ||
-               check_int_arg(arg[iarg], "--kokkos-device", &device)) {
-      if (check_arg(arg[iarg], "--kokkos-device")) {
-        warn_deprecated_command_line_argument("--kokkos-device",
+      settings.set_num_threads(num_threads);
+      remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0;
+    } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id) ||
+               check_arg_int(argv[iarg], "--device-id", device_id) ||
+               check_arg_int(argv[iarg], "--kokkos-device", device_id) ||
+               check_arg_int(argv[iarg], "--device", device_id)) {
+      if (get_flag(argv[iarg]) != "--kokkos-device-id") {
+        warn_deprecated_command_line_argument(get_flag(argv[iarg]),
                                               "--kokkos-device-id");
       }
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
+      if (!is_valid_device_id(device_id)) {
+        std::stringstream ss;
+        ss << "Error: command line argument '" << argv[iarg] << "' is invalid."
+           << " The device id must be greater than or equal to zero."
+           << " Raised by Kokkos::initialize().\n";
+        Kokkos::abort(ss.str().c_str());
       }
-      kokkos_device_found = true;
-      narg--;
-    } else if (!kokkos_device_found &&
-               (check_int_arg(arg[iarg], "--device-id", &device) ||
-                check_int_arg(arg[iarg], "--device", &device))) {
-      if (check_arg(arg[iarg], "--device")) {
-        warn_deprecated_command_line_argument("--device", "--device-id");
+      settings.set_device_id(device_id);
+      remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0;
+    } else if (check_arg(argv[iarg], "--kokkos-num-devices") ||
+               check_arg(argv[iarg], "--num-devices") ||
+               check_arg(argv[iarg], "--kokkos-ndevices") ||
+               check_arg(argv[iarg], "--ndevices")) {
+      if (check_arg(argv[iarg], "--num-devices")) {
+        warn_deprecated_command_line_argument("--num-devices",
+                                              "--kokkos-num-devices");
       }
-      iarg++;
-    } else if (check_arg(arg[iarg], "--kokkos-num-devices") ||
-               check_arg(arg[iarg], "--num-devices") ||
-               check_arg(arg[iarg], "--kokkos-ndevices") ||
-               check_arg(arg[iarg], "--ndevices")) {
-      if (check_arg(arg[iarg], "--ndevices")) {
-        warn_deprecated_command_line_argument("--ndevices", "--num-devices");
+      if (check_arg(argv[iarg], "--ndevices")) {
+        warn_deprecated_command_line_argument("--ndevices",
+                                              "--kokkos-num-devices");
       }
-      if (check_arg(arg[iarg], "--kokkos-ndevices")) {
+      if (check_arg(argv[iarg], "--kokkos-ndevices")) {
         warn_deprecated_command_line_argument("--kokkos-ndevices",
                                               "--kokkos-num-devices");
       }
+      warn_deprecated_command_line_argument(
+          "--kokkos-num-devices", "--kokkos-map-device-id-by=mpi_rank");
       // Find the number of device (expecting --device=XX)
-      if (!((strncmp(arg[iarg], "--kokkos-num-devices=", 21) == 0) ||
-            (strncmp(arg[iarg], "--num-ndevices=", 14) == 0) ||
-            (strncmp(arg[iarg], "--kokkos-ndevices=", 18) == 0) ||
-            (strncmp(arg[iarg], "--ndevices=", 11) == 0)))
+      if (!((strncmp(argv[iarg], "--kokkos-num-devices=", 21) == 0) ||
+            (strncmp(argv[iarg], "--num-devices=", 14) == 0) ||
+            (strncmp(argv[iarg], "--kokkos-ndevices=", 18) == 0) ||
+            (strncmp(argv[iarg], "--ndevices=", 11) == 0)))
         throw_runtime_exception(
             "Error: expecting an '=INT[,INT]' after command line argument "
-            "'--num-devices/--kokkos-num-devices'. Raised by "
-            "Kokkos::initialize(int narg, char* argc[]).");
+            "'--kokkos-num-devices'."
+            " Raised by Kokkos::initialize().");
 
-      char* num1      = strchr(arg[iarg], '=') + 1;
+      char* num1      = strchr(argv[iarg], '=') + 1;
       char* num2      = strpbrk(num1, ",");
       int num1_len    = num2 == nullptr ? strlen(num1) : num2 - num1;
       char* num1_only = new char[num1_len + 1];
@@ -685,400 +883,269 @@ void parse_command_line_arguments(int& narg, char* arg[],
       if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) {
         throw_runtime_exception(
             "Error: expecting an integer number after command line argument "
-            "'--kokkos-numdevices'. Raised by "
-            "Kokkos::initialize(int narg, char* argc[]).");
+            "'--kokkos-num-devices'."
+            " Raised by Kokkos::initialize().");
+      }
+      if (check_arg(argv[iarg], "--kokkos-num-devices") ||
+          check_arg(argv[iarg], "--kokkos-ndevices")) {
+        num_devices = std::stoi(num1_only);
+        settings.set_num_devices(num_devices);
+        settings.set_map_device_id_by("mpi_rank");
       }
-      if (check_arg(arg[iarg], "--kokkos-num-devices") ||
-          check_arg(arg[iarg], "--kokkos-ndevices") || !kokkos_ndevices_found)
-        ndevices = std::stoi(num1_only);
       delete[] num1_only;
 
       if (num2 != nullptr) {
         if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1))
           throw_runtime_exception(
               "Error: expecting an integer number after command line argument "
-              "'--kokkos-num-devices=XX,'. Raised by "
-              "Kokkos::initialize(int narg, char* argc[]).");
+              "'--kokkos-num-devices=XX,'."
+              " Raised by Kokkos::initialize().");
 
-        if (check_arg(arg[iarg], "--kokkos-num-devices") ||
-            check_arg(arg[iarg], "--kokkos-ndevices") || !kokkos_ndevices_found)
+        if (check_arg(argv[iarg], "--kokkos-num-devices") ||
+            check_arg(argv[iarg], "--kokkos-ndevices")) {
           skip_device = std::stoi(num2 + 1);
-      }
-
-      // Remove the --kokkos-num-devices argument from the list but leave
-      // --num-devices
-      if (check_arg(arg[iarg], "--kokkos-num-devices") ||
-          check_arg(arg[iarg], "--kokkos-ndevices")) {
-        for (int k = iarg; k < narg - 1; k++) {
-          arg[k] = arg[k + 1];
+          settings.set_skip_device(skip_device);
         }
-        kokkos_ndevices_found = true;
-        narg--;
-      } else {
-        iarg++;
-      }
-    } else if (check_arg(arg[iarg], "--kokkos-disable-warnings")) {
-      disable_warnings = true;
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
       }
-      narg--;
-    } else if (check_arg(arg[iarg], "--kokkos-tune-internals")) {
-      tune_internals = true;
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
+      remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0;
+    } else if (check_arg_bool(argv[iarg], "--kokkos-disable-warnings",
+                              disable_warnings)) {
+      settings.set_disable_warnings(disable_warnings);
+      remove_flag = true;
+    } else if (check_arg_bool(argv[iarg], "--kokkos-print-configuration",
+                              print_configuration)) {
+      settings.set_print_configuration(print_configuration);
+      remove_flag = true;
+    } else if (check_arg_bool(argv[iarg], "--kokkos-tune-internals",
+                              tune_internals)) {
+      settings.set_tune_internals(tune_internals);
+      remove_flag = true;
+    } else if (check_arg(argv[iarg], "--kokkos-help") ||
+               check_arg(argv[iarg], "--help")) {
+      help_flag   = true;
+      remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0;
+    } else if (check_arg_str(argv[iarg], "--kokkos-map-device-id-by",
+                             map_device_id_by)) {
+      if (!is_valid_map_device_id_by(map_device_id_by)) {
+        std::stringstream ss;
+        ss << "Warning: command line argument '--kokkos-map-device-id-by="
+           << map_device_id_by << "' is not recognized."
+           << " Raised by Kokkos::initialize().\n";
+        Kokkos::abort(ss.str().c_str());
       }
-      narg--;
-    } else if (check_arg(arg[iarg], "--kokkos-help") ||
-               check_arg(arg[iarg], "--help")) {
-      auto const help_message = R"(
-      --------------------------------------------------------------------------------
-      -------------Kokkos command line arguments--------------------------------------
-      --------------------------------------------------------------------------------
-      The following arguments exist also without prefix 'kokkos' (e.g. --help).
-      The prefixed arguments will be removed from the list by Kokkos::initialize(),
-      the non-prefixed ones are not removed. Prefixed versions take precedence over
-      non prefixed ones, and the last occurrence of an argument overwrites prior
-      settings.
-
-      --kokkos-help                  : print this message
-      --kokkos-disable-warnings      : disable kokkos warning messages
-      --kokkos-tune-internals        : allow Kokkos to autotune policies and declare
-                                       tuning features through the tuning system. If
-                                       left off, Kokkos uses heuristics
-      --kokkos-threads=INT           : specify total number of threads or
-                                       number of threads per NUMA region if
-                                       used in conjunction with '--numa' option.
-      --kokkos-numa=INT              : specify number of NUMA regions used by process.
-      --kokkos-device-id=INT         : specify device id to be used by Kokkos.
-      --kokkos-num-devices=INT[,INT] : used when running MPI jobs. Specify number of
-                                       devices per node to be used. Process to device
-                                       mapping happens by obtaining the local MPI rank
-                                       and assigning devices round-robin. The optional
-                                       second argument allows for an existing device
-                                       to be ignored. This is most useful on workstations
-                                       with multiple GPUs of which one is used to drive
-                                       screen output.
-      --kokkos-tools-library         : Equivalent to KOKKOS_PROFILE_LIBRARY environment
-                                       variable. Must either be full path to library or
-                                       name of library if the path is present in the
-                                       runtime library search path (e.g. LD_LIBRARY_PATH)
-      --kokkos-tools-help            : Query the (loaded) kokkos-tool for its command-line
-                                       option support (which should then be passed via
-                                       --kokkos-tools-args="...")
-      --kokkos-tools-args=STR        : A single (quoted) string of options which will be
-                                       whitespace delimited and passed to the loaded
-                                       kokkos-tool as command-line arguments. E.g.
-                                       `<EXE> --kokkos-tools-args="-c input.txt"` will
-                                       pass `<EXE> -c input.txt` as argc/argv to tool
-      --------------------------------------------------------------------------------
-)";
-      std::cout << help_message << std::endl;
+      settings.set_map_device_id_by(map_device_id_by);
+      remove_flag = true;
+    } else if (std::regex_match(argv[iarg],
+                                std::regex("-?-kokkos.*", std::regex::egrep))) {
+      warn_not_recognized_command_line_argument(argv[iarg]);
+    }
 
-      // Remove the --kokkos-help argument from the list but leave --help
-      if (check_arg(arg[iarg], "--kokkos-help")) {
-        for (int k = iarg; k < narg - 1; k++) {
-          arg[k] = arg[k + 1];
-        }
-        narg--;
-      } else {
-        iarg++;
+    if (remove_flag) {
+      // Shift the remainder of the argv list by one.  Note that argv has
+      // (argc + 1) arguments, the last one always being nullptr.  The following
+      // loop moves the trailing nullptr element as well
+      for (int k = iarg; k < argc; ++k) {
+        argv[k] = argv[k + 1];
       }
-    } else
+      argc--;
+    } else {
       iarg++;
+    }
+  }
+
+  if (help_flag) {
+    print_help_message();
   }
+
   if ((tools_init_arguments.args ==
        Kokkos::Tools::InitArguments::unset_string_option) &&
-      narg > 0)
-    tool_args = arg[0];
+      argc > 0) {
+    settings.set_tools_args(argv[0]);
+  }
 }
 
-void parse_environment_variables(InitArguments& arguments) {
-  auto& num_threads      = arguments.num_threads;
-  auto& numa             = arguments.num_numa;
-  auto& device           = arguments.device_id;
-  auto& ndevices         = arguments.ndevices;
-  auto& skip_device      = arguments.skip_device;
-  auto& disable_warnings = arguments.disable_warnings;
-  auto& tune_internals   = arguments.tune_internals;
-  auto& tool_lib         = arguments.tool_lib;
-  auto& tool_args        = arguments.tool_args;
-  auto& tool_help        = arguments.tool_help;
-  char* endptr;
-
-  auto tools_init_arguments = arguments.impl_get_tools_init_arguments();
+void Kokkos::Impl::parse_environment_variables(
+    InitializationSettings& settings) {
+  Tools::InitArguments tools_init_arguments;
+  combine(tools_init_arguments, settings);
   auto init_result =
       Tools::Impl::parse_environment_variables(tools_init_arguments);
-  if (init_result.result == Kokkos::Tools::Impl::InitializationStatus::
-                                environment_argument_mismatch) {
+  if (init_result.result ==
+      Tools::Impl::InitializationStatus::environment_argument_mismatch) {
     Impl::throw_runtime_exception(init_result.error_message);
   }
+  combine(settings, tools_init_arguments);
 
-  tool_lib = tools_init_arguments.lib;
-
-  if (tools_init_arguments.tune_internals !=
-      Kokkos::Tools::InitArguments::PossiblyUnsetOption::unset) {
-    tune_internals = (tools_init_arguments.tune_internals ==
-                      Kokkos::Tools::InitArguments::PossiblyUnsetOption::on)
-                         ? true
-                         : false;
-  }
-  if (tools_init_arguments.help !=
-      Kokkos::Tools::InitArguments::PossiblyUnsetOption::unset) {
-    tool_help = (tools_init_arguments.help ==
-                 Kokkos::Tools::InitArguments::PossiblyUnsetOption::on)
-                    ? true
-                    : false;
+  if (std::getenv("KOKKOS_NUMA")) {
+    warn_deprecated_environment_variable("KOKKOS_NUMA");
   }
-  if (tools_init_arguments.lib !=
-      Kokkos::Tools::InitArguments::unset_string_option) {
-    tool_lib = tools_init_arguments.lib;
+  int num_threads;
+  if (check_env_int("KOKKOS_NUM_THREADS", num_threads)) {
+    if (!is_valid_num_threads(num_threads)) {
+      std::stringstream ss;
+      ss << "Error: environment variable 'KOKKOS_NUM_THREADS=" << num_threads
+         << "' is invalid."
+         << " The number of threads must be greater than or equal to one."
+         << " Raised by Kokkos::initialize().\n";
+      Kokkos::abort(ss.str().c_str());
+    }
+    settings.set_num_threads(num_threads);
   }
-  if (tools_init_arguments.args !=
-      Kokkos::Tools::InitArguments::unset_string_option) {
-    tool_args = tools_init_arguments.args;
+  int device_id;
+  if (check_env_int("KOKKOS_DEVICE_ID", device_id)) {
+    if (!is_valid_device_id(device_id)) {
+      std::stringstream ss;
+      ss << "Error: environment variable 'KOKKOS_DEVICE_ID" << device_id
+         << "' is invalid."
+         << " The device id must be greater than or equal to zero."
+         << " Raised by Kokkos::initialize().\n";
+      Kokkos::abort(ss.str().c_str());
+    }
+    settings.set_device_id(device_id);
   }
-
-  auto env_num_threads_str = std::getenv("KOKKOS_NUM_THREADS");
-  if (env_num_threads_str != nullptr) {
-    errno                = 0;
-    auto env_num_threads = std::strtol(env_num_threads_str, &endptr, 10);
-    if (endptr == env_num_threads_str)
-      Impl::throw_runtime_exception(
-          "Error: cannot convert KOKKOS_NUM_THREADS to an integer. Raised by "
-          "Kokkos::initialize(int narg, char* argc[]).");
-    if (errno == ERANGE)
-      Impl::throw_runtime_exception(
-          "Error: KOKKOS_NUM_THREADS out of range of representable values by "
-          "an integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
-    if ((num_threads != -1) && (env_num_threads != num_threads))
-      Impl::throw_runtime_exception(
-          "Error: expecting a match between --kokkos-threads and "
-          "KOKKOS_NUM_THREADS if both are set. Raised by "
-          "Kokkos::initialize(int narg, char* argc[]).");
-    else
-      num_threads = env_num_threads;
+  int num_devices;
+  int rand_devices;
+  bool has_num_devices  = check_env_int("KOKKOS_NUM_DEVICES", num_devices);
+  bool has_rand_devices = check_env_int("KOKKOS_RAND_DEVICES", rand_devices);
+  if (has_rand_devices && has_num_devices) {
+    Impl::throw_runtime_exception(
+        "Error: cannot specify both KOKKOS_NUM_DEVICES and "
+        "KOKKOS_RAND_DEVICES."
+        " Raised by Kokkos::initialize().");
   }
-  auto env_numa_str = std::getenv("KOKKOS_NUMA");
-  if (env_numa_str != nullptr) {
-    errno         = 0;
-    auto env_numa = std::strtol(env_numa_str, &endptr, 10);
-    if (endptr == env_numa_str)
-      Impl::throw_runtime_exception(
-          "Error: cannot convert KOKKOS_NUMA to an integer. Raised by "
-          "Kokkos::initialize(int narg, char* argc[]).");
-    if (errno == ERANGE)
-      Impl::throw_runtime_exception(
-          "Error: KOKKOS_NUMA out of range of representable values by an "
-          "integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
-    if ((numa != -1) && (env_numa != numa))
-      Impl::throw_runtime_exception(
-          "Error: expecting a match between --kokkos-numa and KOKKOS_NUMA if "
-          "both are set. Raised by Kokkos::initialize(int narg, char* "
-          "argc[]).");
-    else
-      numa = env_numa;
+  if (has_num_devices) {
+    warn_deprecated_environment_variable("KOKKOS_NUM_DEVICES",
+                                         "KOKKOS_MAP_DEVICE_ID_BY=mpi_rank");
+    settings.set_map_device_id_by("mpi_rank");
+    settings.set_num_devices(num_devices);
   }
-  auto env_device_str = std::getenv("KOKKOS_DEVICE_ID");
-  if (env_device_str != nullptr) {
-    errno           = 0;
-    auto env_device = std::strtol(env_device_str, &endptr, 10);
-    if (endptr == env_device_str)
-      Impl::throw_runtime_exception(
-          "Error: cannot convert KOKKOS_DEVICE_ID to an integer. Raised by "
-          "Kokkos::initialize(int narg, char* argc[]).");
-    if (errno == ERANGE)
-      Impl::throw_runtime_exception(
-          "Error: KOKKOS_DEVICE_ID out of range of representable values by an "
-          "integer. Raised by Kokkos::initialize(int narg, char* argc[]).");
-    if ((device != -1) && (env_device != device))
-      Impl::throw_runtime_exception(
-          "Error: expecting a match between --kokkos-device and "
-          "KOKKOS_DEVICE_ID if both are set. Raised by Kokkos::initialize(int "
-          "narg, char* argc[]).");
-    else
-      device = env_device;
+  if (has_rand_devices) {
+    warn_deprecated_environment_variable("KOKKOS_RAND_DEVICES",
+                                         "KOKKOS_MAP_DEVICE_ID_BY=random");
+    settings.set_map_device_id_by("random");
+    settings.set_num_devices(rand_devices);
   }
-  auto env_rdevices_str = std::getenv("KOKKOS_RAND_DEVICES");
-  auto env_ndevices_str = std::getenv("KOKKOS_NUM_DEVICES");
-  if (env_ndevices_str != nullptr || env_rdevices_str != nullptr) {
-    errno = 0;
-    if (env_ndevices_str != nullptr && env_rdevices_str != nullptr) {
-      Impl::throw_runtime_exception(
-          "Error: cannot specify both KOKKOS_NUM_DEVICES and "
-          "KOKKOS_RAND_DEVICES. "
-          "Raised by Kokkos::initialize(int narg, char* argc[]).");
-    }
-    int rdevices = -1;
-    if (env_ndevices_str != nullptr) {
-      auto env_ndevices = std::strtol(env_ndevices_str, &endptr, 10);
-      if (endptr == env_ndevices_str)
-        Impl::throw_runtime_exception(
-            "Error: cannot convert KOKKOS_NUM_DEVICES to an integer. Raised by "
-            "Kokkos::initialize(int narg, char* argc[]).");
-      if (errno == ERANGE)
-        Impl::throw_runtime_exception(
-            "Error: KOKKOS_NUM_DEVICES out of range of representable values by "
-            "an integer. Raised by Kokkos::initialize(int narg, char* "
-            "argc[]).");
-      if ((ndevices != -1) && (env_ndevices != ndevices))
-        Impl::throw_runtime_exception(
-            "Error: expecting a match between --kokkos-ndevices and "
-            "KOKKOS_NUM_DEVICES if both are set. Raised by "
-            "Kokkos::initialize(int narg, char* argc[]).");
-      else
-        ndevices = env_ndevices;
-    } else {  // you set KOKKOS_RAND_DEVICES
-      auto env_rdevices = std::strtol(env_rdevices_str, &endptr, 10);
-      if (endptr == env_ndevices_str)
-        Impl::throw_runtime_exception(
-            "Error: cannot convert KOKKOS_RAND_DEVICES to an integer. Raised "
-            "by Kokkos::initialize(int narg, char* argc[]).");
-      if (errno == ERANGE)
-        Impl::throw_runtime_exception(
-            "Error: KOKKOS_RAND_DEVICES out of range of representable values "
-            "by an integer. Raised by Kokkos::initialize(int narg, char* "
-            "argc[]).");
-      else
-        rdevices = env_rdevices;
-    }
-    // Skip device
-    auto env_skip_device_str = std::getenv("KOKKOS_SKIP_DEVICE");
-    if (env_skip_device_str != nullptr) {
-      errno                = 0;
-      auto env_skip_device = std::strtol(env_skip_device_str, &endptr, 10);
-      if (endptr == env_skip_device_str)
-        Impl::throw_runtime_exception(
-            "Error: cannot convert KOKKOS_SKIP_DEVICE to an integer. Raised by "
-            "Kokkos::initialize(int narg, char* argc[]).");
-      if (errno == ERANGE)
-        Impl::throw_runtime_exception(
-            "Error: KOKKOS_SKIP_DEVICE out of range of representable values by "
-            "an integer. Raised by Kokkos::initialize(int narg, char* "
-            "argc[]).");
-      if ((skip_device != 9999) && (env_skip_device != skip_device))
-        Impl::throw_runtime_exception(
-            "Error: expecting a match between --kokkos-ndevices and "
-            "KOKKOS_SKIP_DEVICE if both are set. Raised by "
-            "Kokkos::initialize(int narg, char* argc[]).");
-      else
-        skip_device = env_skip_device;
-    }
-    if (rdevices > 0) {
-      if (skip_device > 0 && rdevices == 1)
-        Impl::throw_runtime_exception(
-            "Error: cannot KOKKOS_SKIP_DEVICE the only KOKKOS_RAND_DEVICE. "
-            "Raised by Kokkos::initialize(int narg, char* argc[]).");
-
-      std::srand(get_process_id());
-      while (device < 0) {
-        int test_device = std::rand() % rdevices;
-        if (test_device != skip_device) device = test_device;
-      }
+  if (has_num_devices || has_rand_devices) {
+    int skip_device;
+    if (check_env_int("KOKKOS_SKIP_DEVICE", skip_device)) {
+      settings.set_skip_device(skip_device);
     }
   }
-  char* env_disablewarnings_str = std::getenv("KOKKOS_DISABLE_WARNINGS");
-  if (env_disablewarnings_str != nullptr) {
-    std::string env_str(env_disablewarnings_str);  // deep-copies string
-    for (char& c : env_str) {
-      c = toupper(c);
+  bool disable_warnings;
+  if (check_env_bool("KOKKOS_DISABLE_WARNINGS", disable_warnings)) {
+    settings.set_disable_warnings(disable_warnings);
+  }
+  bool print_configuration;
+  if (check_env_bool("KOKKOS_PRINT_CONFIGURATION", print_configuration)) {
+    settings.set_print_configuration(print_configuration);
+  }
+  bool tune_internals;
+  if (check_env_bool("KOKKOS_TUNE_INTERNALS", tune_internals)) {
+    settings.set_tune_internals(tune_internals);
+  }
+  char const* map_device_id_by = std::getenv("KOKKOS_MAP_DEVICE_ID_BY");
+  if (map_device_id_by != nullptr) {
+    if (std::getenv("KOKKOS_DEVICE_ID")) {
+      std::cerr << "Warning: environment variable KOKKOS_MAP_DEVICE_ID_BY"
+                << "ignored since KOKKOS_DEVICE_ID is specified."
+                << " Raised by Kokkos::initialize()." << std::endl;
+    }
+    if (!is_valid_map_device_id_by(map_device_id_by)) {
+      std::stringstream ss;
+      ss << "Warning: environment variable 'KOKKOS_MAP_DEVICE_ID_BY="
+         << map_device_id_by << "' is not recognized."
+         << " Raised by Kokkos::initialize().\n";
+      Kokkos::abort(ss.str().c_str());
     }
-    const auto _rc = std::regex_constants::icase | std::regex_constants::egrep;
-    const auto _re = std::regex("^(true|on|yes|[1-9])$", _rc);
-    if (std::regex_match(env_str, _re))
-      disable_warnings = true;
-    else if (disable_warnings)
-      Impl::throw_runtime_exception(
-          "Error: expecting a match between --kokkos-disable-warnings and "
-          "KOKKOS_DISABLE_WARNINGS if both are set. Raised by "
-          "Kokkos::initialize(int narg, char* argc[]).");
+    settings.set_map_device_id_by(map_device_id_by);
   }
 }
 
-}  // namespace
-
-}  // namespace Impl
-}  // namespace Kokkos
-
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-
-void initialize(int& narg, char* arg[]) {
-  InitArguments arguments;
-  Impl::parse_command_line_arguments(narg, arg, arguments);
-  Impl::parse_environment_variables(arguments);
-  Impl::initialize_internal(arguments);
+void Kokkos::initialize(int& argc, char* argv[]) {
+  InitializationSettings settings;
+  Impl::parse_environment_variables(settings);
+  Impl::parse_command_line_arguments(argc, argv, settings);
+  initialize_internal(settings);
 }
 
-void initialize(InitArguments arguments) {
-  Impl::parse_environment_variables(arguments);
-  Impl::initialize_internal(arguments);
+void Kokkos::initialize(InitializationSettings const& settings) {
+  InitializationSettings tmp;
+  Impl::parse_environment_variables(tmp);
+  combine(tmp, settings);
+  initialize_internal(tmp);
 }
 
-namespace Impl {
-
-void pre_initialize(const InitArguments& args) {
-  pre_initialize_internal(args);
+void Kokkos::Impl::pre_initialize(const InitializationSettings& settings) {
+  pre_initialize_internal(settings);
 }
 
-void post_initialize(const InitArguments& args) {
-  post_initialize_internal(args);
+void Kokkos::Impl::post_initialize(const InitializationSettings& settings) {
+  post_initialize_internal(settings);
 }
 
-}  // namespace Impl
-
-void push_finalize_hook(std::function<void()> f) { finalize_hooks.push(f); }
+void Kokkos::push_finalize_hook(std::function<void()> f) {
+  finalize_hooks.push(f);
+}
 
-void finalize() { Impl::finalize_internal(); }
+void Kokkos::finalize() { finalize_internal(); }
 
-void finalize_all() {
-  enum : bool { all_spaces = true };
-  Impl::finalize_internal(all_spaces);
-}
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+KOKKOS_DEPRECATED void Kokkos::finalize_all() { finalize_internal(); }
+#endif
 
-void fence() { Impl::fence_internal("Kokkos::fence: Unnamed Global Fence"); }
-void fence(const std::string& name) { Impl::fence_internal(name); }
+#ifdef KOKKOS_COMPILER_INTEL
+void Kokkos::fence() { fence("Kokkos::fence: Unnamed Global Fence"); }
+#endif
+void Kokkos::fence(const std::string& name) { fence_internal(name); }
 
-void print_helper(std::ostringstream& out,
+namespace {
+void print_helper(std::ostream& os,
                   const std::map<std::string, std::string>& print_me) {
   for (const auto& kv : print_me) {
-    out << kv.first << ": " << kv.second << '\n';
+    os << kv.first << ": " << kv.second << '\n';
   }
 }
+}  // namespace
 
-void print_configuration(std::ostream& out, const bool detail) {
-  std::ostringstream msg;
+void Kokkos::print_configuration(std::ostream& os, bool verbose) {
+  print_helper(os, metadata_map["version_info"]);
 
-  print_helper(msg, Kokkos::Impl::metadata_map["version_info"]);
+  os << "Compiler:\n";
+  print_helper(os, metadata_map["compiler_version"]);
 
-  msg << "Compiler:" << std::endl;
-  print_helper(msg, Kokkos::Impl::metadata_map["compiler_version"]);
+  os << "Architecture:\n";
+  print_helper(os, metadata_map["architecture"]);
 
-  msg << "Architecture:" << std::endl;
-  print_helper(msg, Kokkos::Impl::metadata_map["architecture"]);
+  os << "Atomics:\n";
+  print_helper(os, metadata_map["atomics"]);
 
-  msg << "Atomics:" << std::endl;
-  print_helper(msg, Kokkos::Impl::metadata_map["atomics"]);
+  os << "Vectorization:\n";
+  print_helper(os, metadata_map["vectorization"]);
 
-  msg << "Vectorization:" << std::endl;
-  print_helper(msg, Kokkos::Impl::metadata_map["vectorization"]);
+  os << "Memory:\n";
+  print_helper(os, metadata_map["memory"]);
 
-  msg << "Memory:" << std::endl;
-  print_helper(msg, Kokkos::Impl::metadata_map["memory"]);
+  os << "Options:\n";
+  print_helper(os, metadata_map["options"]);
 
-  msg << "Options:" << std::endl;
-  print_helper(msg, Kokkos::Impl::metadata_map["options"]);
+  Impl::ExecSpaceManager::get_instance().print_configuration(os, verbose);
+}
 
-  Impl::ExecSpaceManager::get_instance().print_configuration(msg, detail);
+KOKKOS_ATTRIBUTE_NODISCARD bool Kokkos::is_initialized() noexcept {
+  return g_is_initialized;
+}
 
-  out << msg.str() << std::endl;
+KOKKOS_ATTRIBUTE_NODISCARD bool Kokkos::is_finalized() noexcept {
+  return g_is_finalized;
 }
 
-bool is_initialized() noexcept { return g_is_initialized; }
+bool Kokkos::show_warnings() noexcept { return g_show_warnings; }
+
+bool Kokkos::tune_internals() noexcept { return g_tune_internals; }
 
-bool show_warnings() noexcept { return g_show_warnings; }
-bool tune_internals() noexcept { return g_tune_internals; }
+namespace Kokkos {
 
 #ifdef KOKKOS_COMPILER_PGI
 namespace Impl {
@@ -1088,5 +1155,8 @@ namespace Impl {
 void _kokkos_pgi_compiler_bug_workaround() {}
 }  // end namespace Impl
 #endif
-
 }  // namespace Kokkos
+
+Kokkos::Impl::InitializationSettingsHelper<std::string>::storage_type const
+    Kokkos::Impl::InitializationSettingsHelper<std::string>::unspecified =
+        "some string we don't expect user would ever provide";
diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp b/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp
similarity index 75%
rename from packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
rename to packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp
index 1a0b10e40..34421f0fe 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp
@@ -42,26 +42,21 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_EXEC_SPACE_INITIALIZER_HPP
-#define KOKKOS_EXEC_SPACE_INITIALIZER_HPP
+#ifndef KOKKOS_DEVICE_MANAGEMENT_HPP
+#define KOKKOS_DEVICE_MANAGEMENT_HPP
 
-#include <iosfwd>
+#include <vector>
 
 namespace Kokkos {
+class InitializationSettings;
 namespace Impl {
-
-class ExecSpaceInitializerBase {
- public:
-  virtual void initialize(const InitArguments &args)                     = 0;
-  virtual void finalize(const bool all_spaces)                           = 0;
-  virtual void fence()                                                   = 0;
-  virtual void fence(const std::string &)                                = 0;
-  virtual void print_configuration(std::ostream &msg, const bool detail) = 0;
-  ExecSpaceInitializerBase()          = default;
-  virtual ~ExecSpaceInitializerBase() = default;
-};
-
+int get_gpu(const Kokkos::InitializationSettings& settings);
+// This declaration is provided for testing purposes only
+int get_ctest_gpu(const char* local_rank_str);
+// ditto
+std::vector<int> get_visible_devices(
+    Kokkos::InitializationSettings const& settings, int device_count);
 }  // namespace Impl
 }  // namespace Kokkos
 
-#endif  // KOKKOS_EXEC_SPACE_INITIALIZER_HPP
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
index dc8e5e4d8..87d6c044a 100644
--- a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
@@ -79,21 +79,19 @@ struct EBOBaseImpl;
 
 template <class T, template <class...> class CtorNotOnDevice>
 struct EBOBaseImpl<T, true, CtorNotOnDevice> {
-  template <
-      class... Args, class _ignored = void,
-      typename std::enable_if<std::is_void<_ignored>::value &&
-                                  std::is_constructible<T, Args...>::value &&
-                                  !CtorNotOnDevice<Args...>::value,
-                              int>::type = 0>
+  template <class... Args, class _ignored = void,
+            std::enable_if_t<std::is_void<_ignored>::value &&
+                                 std::is_constructible<T, Args...>::value &&
+                                 !CtorNotOnDevice<Args...>::value,
+                             int> = 0>
   KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl(
       Args&&...) noexcept {}
 
-  template <
-      class... Args, class _ignored = void,
-      typename std::enable_if<std::is_void<_ignored>::value &&
-                                  std::is_constructible<T, Args...>::value &&
-                                  CtorNotOnDevice<Args...>::value,
-                              long>::type = 0>
+  template <class... Args, class _ignored = void,
+            std::enable_if_t<std::is_void<_ignored>::value &&
+                                 std::is_constructible<T, Args...>::value &&
+                                 CtorNotOnDevice<Args...>::value,
+                             long> = 0>
   inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {}
 
   KOKKOS_DEFAULTED_FUNCTION
@@ -139,22 +137,20 @@ template <class T, template <class...> class CTorsNotOnDevice>
 struct EBOBaseImpl<T, false, CTorsNotOnDevice> {
   T m_ebo_object;
 
-  template <
-      class... Args, class _ignored = void,
-      typename std::enable_if<std::is_void<_ignored>::value &&
-                                  !CTorsNotOnDevice<Args...>::value &&
-                                  std::is_constructible<T, Args...>::value,
-                              int>::type = 0>
+  template <class... Args, class _ignored = void,
+            std::enable_if_t<std::is_void<_ignored>::value &&
+                                 !CTorsNotOnDevice<Args...>::value &&
+                                 std::is_constructible<T, Args...>::value,
+                             int> = 0>
   KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl(
       Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...)))
       : m_ebo_object(std::forward<Args>(args)...) {}
 
-  template <
-      class... Args, class _ignored = void,
-      typename std::enable_if<std::is_void<_ignored>::value &&
-                                  CTorsNotOnDevice<Args...>::value &&
-                                  std::is_constructible<T, Args...>::value,
-                              long>::type = 0>
+  template <class... Args, class _ignored = void,
+            std::enable_if_t<std::is_void<_ignored>::value &&
+                                 CTorsNotOnDevice<Args...>::value &&
+                                 std::is_constructible<T, Args...>::value,
+                             long> = 0>
   inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept(
       noexcept(T(std::forward<Args>(args)...)))
       : m_ebo_object(std::forward<Args>(args)...) {}
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
index a28d00858..750228331 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <cstring>
 #include <cstdlib>
 
@@ -138,6 +142,9 @@ void Experimental::RawMemoryAllocationFailure::print_error_message(
     case AllocationMechanism::CudaHostAlloc: o << "cudaHostAlloc()."; break;
     case AllocationMechanism::HIPMalloc: o << "hipMalloc()."; break;
     case AllocationMechanism::HIPHostMalloc: o << "hipHostMalloc()."; break;
+    case AllocationMechanism::HIPMallocManaged:
+      o << "hipMallocManaged().";
+      break;
     case AllocationMechanism::SYCLMallocDevice:
       o << "sycl::malloc_device().";
       break;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
index 5d7c60fba..63b40f297 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -67,12 +67,65 @@ namespace Impl {
 
 [[noreturn]] void host_abort(const char *const);
 
-void throw_runtime_exception(const std::string &);
+#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
+
+#if defined(__APPLE__) || defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
+// cuda_abort does not abort when building for macOS.
+// required to workaround failures in random number generator unit tests with
+// pre-volta architectures
+#define KOKKOS_IMPL_ABORT_NORETURN
+#else
+// cuda_abort aborts when building for other platforms than macOS
+#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
+#endif
+
+#elif defined(KOKKOS_COMPILER_NVHPC)
+
+#define KOKKOS_IMPL_ABORT_NORETURN
+
+#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)
+// HIP aborts
+#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+// FIXME_SYCL SYCL doesn't abort
+#define KOKKOS_IMPL_ABORT_NORETURN
+#elif !defined(KOKKOS_ENABLE_OPENMPTARGET)
+// Host aborts
+#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
+#else
+// Everything else does not abort
+#define KOKKOS_IMPL_ABORT_NORETURN
+#endif
+
+#ifdef KOKKOS_ENABLE_SYCL  // FIXME_SYCL
+#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE
+#else
+#define KOKKOS_IMPL_ABORT_NORETURN_DEVICE KOKKOS_IMPL_ABORT_NORETURN
+#endif
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+KOKKOS_IMPL_ABORT_NORETURN_DEVICE inline KOKKOS_IMPL_DEVICE_FUNCTION void
+device_abort(const char *const msg) {
+#if defined(KOKKOS_ENABLE_CUDA)
+  ::Kokkos::Impl::cuda_abort(msg);
+#elif defined(KOKKOS_ENABLE_HIP)
+  ::Kokkos::Impl::hip_abort(msg);
+#elif defined(KOKKOS_ENABLE_SYCL)
+  ::Kokkos::Impl::sycl_abort(msg);
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
+  printf("%s", msg);  // FIXME_OPENMPTARGET
+#else
+#error faulty logic
+#endif
+}
+#endif
+
+[[noreturn]] void throw_runtime_exception(const std::string &msg);
 
 void traceback_callstack(std::ostream &);
 
 std::string human_memory_size(size_t arg_bytes);
-void throw_runtime_exception(const std::string &msg);
 
 }  // namespace Impl
 
@@ -97,6 +150,7 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
     CudaHostAlloc,
     HIPMalloc,
     HIPHostMalloc,
+    HIPMallocManaged,
     SYCLMallocDevice,
     SYCLMallocShared,
     SYCLMallocHost
@@ -171,48 +225,16 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
-
-#if defined(__APPLE__) || defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-// cuda_abort does not abort when building for macOS.
-// required to workaround failures in random number generator unit tests with
-// pre-volta architectures
-#define KOKKOS_IMPL_ABORT_NORETURN
-#else
-// cuda_abort aborts when building for other platforms than macOS
-#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
-#endif
-
-#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)
-// HIP aborts
-#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
-#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
-// FIXME_SYCL SYCL doesn't abort
-#define KOKKOS_IMPL_ABORT_NORETURN
-#elif !defined(KOKKOS_ENABLE_OPENMPTARGET)
-// Host aborts
-#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]]
-#else
-// Everything else does not abort
-#define KOKKOS_IMPL_ABORT_NORETURN
-#endif
-
 namespace Kokkos {
+
 KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort(
     const char *const message) {
-#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
-  Kokkos::Impl::cuda_abort(message);
-#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)
-  Kokkos::Impl::hip_abort(message);
-#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
-  Kokkos::Impl::sycl_abort(message);
-#elif !defined(KOKKOS_ENABLE_OPENMPTARGET)
-  Kokkos::Impl::host_abort(message);
-#else
-  (void)message;  // FIXME_OPENMPTARGET
-#endif
+  KOKKOS_IF_ON_HOST(::Kokkos::Impl::host_abort(message);)
+  KOKKOS_IF_ON_DEVICE(::Kokkos::Impl::device_abort(message);)
 }
 
+#undef KOKKOS_IMPL_ABORT_NORETURN
+
 }  // namespace Kokkos
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
index 1c337b957..75b89c73a 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
@@ -42,19 +42,25 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
 #include <sstream>
 
 namespace Kokkos {
 namespace Impl {
-PerTeamValue::PerTeamValue(int arg) : value(arg) {}
+PerTeamValue::PerTeamValue(size_t arg) : value(arg) {}
 
-PerThreadValue::PerThreadValue(int arg) : value(arg) {}
+PerThreadValue::PerThreadValue(size_t arg) : value(arg) {}
 }  // namespace Impl
 
-Impl::PerTeamValue PerTeam(const int& arg) { return Impl::PerTeamValue(arg); }
+Impl::PerTeamValue PerTeam(const size_t& arg) {
+  return Impl::PerTeamValue(arg);
+}
 
-Impl::PerThreadValue PerThread(const int& arg) {
+Impl::PerThreadValue PerThread(const size_t& arg) {
   return Impl::PerThreadValue(arg);
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp
new file mode 100644
index 000000000..354bdde9b
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXEC_SPACE_MANAGER_HPP
+#define KOKKOS_EXEC_SPACE_MANAGER_HPP
+
+#include <impl/Kokkos_InitializationSettings.hpp>
+#include <Kokkos_DetectionIdiom.hpp>
+#include <Kokkos_Concepts.hpp>
+
+#include <iosfwd>
+#include <map>
+#include <string>
+
+namespace {
+
+template <class T>
+using public_member_types_t = std::enable_if_t<
+    Kokkos::is_execution_space<typename T::execution_space>::value &&
+    Kokkos::is_memory_space<typename T::memory_space>::value &&
+    Kokkos::is_device<typename T::device_type>::value &&
+    Kokkos::is_array_layout<typename T::array_layout>::value &&
+    std::is_integral<typename T::size_type>::value &&
+    Kokkos::is_memory_space<typename T::scratch_memory_space>::value>;
+
+template <class T>
+using print_configuration_t = std::enable_if_t<
+    std::is_void<decltype(std::declval<T const&>().print_configuration(
+        std::declval<std::ostream&>()))>::value &&
+    std::is_void<decltype(std::declval<T const&>().print_configuration(
+        std::declval<std::ostream&>(), false))>::value>;
+
+template <class T>
+using initialize_finalize_t = std::enable_if_t<
+    std::is_void<decltype(T::impl_initialize(
+        std::declval<Kokkos::InitializationSettings const&>()))>::value &&
+    std::is_void<decltype(T::impl_finalize())>::value>;
+
+template <class T>
+using fence_t = std::enable_if_t<
+    std::is_void<decltype(std::declval<T const&>().fence())>::value &&
+    std::is_void<decltype(std::declval<T const&>().fence("name"))>::value &&
+    std::is_void<decltype(T::impl_static_fence("name"))>::value>;
+
+#define STATIC_ASSERT(...) static_assert(__VA_ARGS__, "")  // FIXME C++17
+
+template <class ExecutionSpace>
+constexpr bool check_valid_execution_space() {
+  using Kokkos::is_detected;
+  STATIC_ASSERT(std::is_default_constructible<ExecutionSpace>::value);
+  STATIC_ASSERT(is_detected<public_member_types_t, ExecutionSpace>::value);
+  STATIC_ASSERT(is_detected<print_configuration_t, ExecutionSpace>::value);
+  STATIC_ASSERT(is_detected<initialize_finalize_t, ExecutionSpace>::value);
+  STATIC_ASSERT(is_detected<fence_t, ExecutionSpace>::value);
+#ifndef KOKKOS_ENABLE_HPX  // FIXME_HPX
+  STATIC_ASSERT(sizeof(ExecutionSpace) <= 2 * sizeof(void*));
+#endif
+  return true;
+}
+
+#undef STATIC_ASSERT
+
+}  // namespace
+
+namespace Kokkos {
+namespace Impl {
+
+struct ExecSpaceBase {
+  virtual void initialize(InitializationSettings const&)           = 0;
+  virtual void finalize()                                          = 0;
+  virtual void static_fence(std::string const&)                    = 0;
+  virtual void print_configuration(std::ostream& os, bool verbose) = 0;
+  virtual ~ExecSpaceBase()                                         = default;
+};
+
+template <class ExecutionSpace>
+struct ExecSpaceDerived : ExecSpaceBase {
+  static_assert(check_valid_execution_space<ExecutionSpace>(), "");
+  void initialize(InitializationSettings const& settings) final {
+    ExecutionSpace::impl_initialize(settings);
+  }
+  void finalize() final { ExecutionSpace::impl_finalize(); }
+  void static_fence(std::string const& label) final {
+    ExecutionSpace::impl_static_fence(label);
+  }
+  void print_configuration(std::ostream& os, bool verbose) final {
+    ExecutionSpace().print_configuration(os, verbose);
+  }
+};
+
+/* ExecSpaceManager - Responsible for initializing all the registered
+ * backends. Backends are registered using the register_space_initializer()
+ * function which should be called from a global context so that it is called
+ * prior to initialize_spaces() which is called from Kokkos::initialize()
+ */
+class ExecSpaceManager {
+  std::map<std::string, std::unique_ptr<ExecSpaceBase>> exec_space_factory_list;
+  ExecSpaceManager() = default;
+
+ public:
+  void register_space_factory(std::string name,
+                              std::unique_ptr<ExecSpaceBase> ptr);
+  void initialize_spaces(const Kokkos::InitializationSettings& settings);
+  void finalize_spaces();
+  void static_fence(const std::string&);
+  void print_configuration(std::ostream& os, bool verbose);
+  static ExecSpaceManager& get_instance();
+};
+
+template <class ExecutionSpace>
+int initialize_space_factory(std::string name) {
+  auto space_ptr = std::make_unique<ExecSpaceDerived<ExecutionSpace>>();
+  ExecSpaceManager::get_instance().register_space_factory(name,
+                                                          std::move(space_ptr));
+  return 1;
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
deleted file mode 100644
index 504fba026..000000000
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ /dev/null
@@ -1,2055 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_FUNCTORADAPTER_HPP
-#define KOKKOS_FUNCTORADAPTER_HPP
-
-#include <cstddef>
-#include <Kokkos_Core_fwd.hpp>
-#include <impl/Kokkos_Traits.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class Enable = void>
-struct ReduceFunctorHasInit {
-  enum : bool { value = false };
-};
-
-// The else clause idiom failed with NVCC+MSVC, causing some symbols not being
-// compiled for the device. The code in there is anyway sketchy, and likely not
-// standard compliant (just happens to work on all compilers we ever used)
-// We intend to replace all of this long term with proper detection idiom.
-#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
-template <class>
-using impl_void_t_workaround = void;
-
-template <class F>
-using init_archetype = decltype(&F::init);
-
-template <class FunctorType>
-struct ReduceFunctorHasInit<
-    FunctorType, impl_void_t_workaround<init_archetype<FunctorType>>> {
-  enum : bool { value = true };
-};
-#else
-template <class FunctorType>
-struct ReduceFunctorHasInit<
-    FunctorType,
-    typename std::enable_if<0 < sizeof(&FunctorType::init)>::type> {
-  enum : bool { value = true };
-};
-#endif
-
-template <class FunctorType, class Enable = void>
-struct ReduceFunctorHasJoin {
-  enum : bool { value = false };
-};
-
-#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
-template <class F>
-using join_archetype = decltype(&F::join);
-
-template <class FunctorType>
-struct ReduceFunctorHasJoin<
-    FunctorType, impl_void_t_workaround<join_archetype<FunctorType>>> {
-  enum : bool { value = true };
-};
-#else
-template <class FunctorType>
-struct ReduceFunctorHasJoin<
-    FunctorType,
-    typename std::enable_if<0 < sizeof(&FunctorType::join)>::type> {
-  enum : bool { value = true };
-};
-#endif
-
-template <class FunctorType, class Enable = void>
-struct ReduceFunctorHasFinal {
-  enum : bool { value = false };
-};
-
-#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
-template <class F>
-using final_archetype = decltype(&F::final);
-
-template <class FunctorType>
-struct ReduceFunctorHasFinal<
-    FunctorType, impl_void_t_workaround<final_archetype<FunctorType>>> {
-  enum : bool { value = true };
-};
-#else
-template <class FunctorType>
-struct ReduceFunctorHasFinal<
-    FunctorType,
-    typename std::enable_if<0 < sizeof(&FunctorType::final)>::type> {
-  enum : bool { value = true };
-};
-#endif
-
-template <class FunctorType, class Enable = void>
-struct ReduceFunctorHasShmemSize {
-  enum : bool { value = false };
-};
-
-#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
-template <class F>
-using shmemsize_archetype = decltype(&F::team_shmem_size);
-
-template <class FunctorType>
-struct ReduceFunctorHasShmemSize<
-    FunctorType, impl_void_t_workaround<shmemsize_archetype<FunctorType>>> {
-  enum : bool { value = true };
-};
-#else
-template <class FunctorType>
-struct ReduceFunctorHasShmemSize<
-    FunctorType,
-    typename std::enable_if<0 < sizeof(&FunctorType::team_shmem_size)>::type> {
-  enum : bool { value = true };
-};
-#endif
-
-template <class FunctorType, class ArgTag, class Enable = void>
-struct FunctorDeclaresValueType : public std::false_type {};
-
-template <class FunctorType, class ArgTag>
-struct FunctorDeclaresValueType<FunctorType, ArgTag,
-                                void_t<typename FunctorType::value_type>>
-    : public std::true_type {};
-
-/** \brief  Query Functor and execution policy argument tag for value type.
- *
- *  If C++11 enabled and 'value_type' is not explicitly declared then attempt
- *  to deduce the type from FunctorType::operator().
- */
-template <class FunctorType, class ArgTag,
-          bool Dec = FunctorDeclaresValueType<FunctorType, ArgTag>::value>
-struct FunctorValueTraits {
-  using value_type     = void;
-  using pointer_type   = void;
-  using reference_type = void;
-  using functor_type   = void;
-
-  enum { StaticValueSize = 0 };
-
-  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_count(const FunctorType&) {
-    return 0;
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_size(const FunctorType&) {
-    return 0;
-  }
-};
-
-template <class ArgTag>
-struct FunctorValueTraits<void, ArgTag, false> {
-  using value_type     = void;
-  using pointer_type   = void;
-  using reference_type = void;
-  using functor_type   = void;
-};
-
-/** \brief  FunctorType::value_type is explicitly declared so use it.
- *
- * Two options for declaration
- *
- *   1) A plain-old-data (POD) type
- *        using value_type = {pod_type};
- *
- *   2) An array of POD of a runtime specified count.
- *        using value_type = {pod_type}[];
- *        const unsigned     value_count ;
- */
-template <class FunctorType, class ArgTag>
-struct FunctorValueTraits<FunctorType, ArgTag,
-                          true /* == exists FunctorType::value_type */> {
-  using value_type =
-      typename std::remove_extent<typename FunctorType::value_type>::type;
-  using functor_type = FunctorType;
-
-  static_assert((sizeof(value_type) < sizeof(int)) ||
-                    0 == (sizeof(value_type) % sizeof(int)),
-                "Reduction functor's declared value_type requires: 0 == "
-                "sizeof(value_type) % sizeof(int)");
-
-  /* this cast to bool is needed for correctness by NVCC */
-  enum : bool {
-    IsArray = static_cast<bool>(
-        std::is_array<typename FunctorType::value_type>::value)
-  };
-
-  // If not an array then what is the sizeof(value_type)
-  enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) };
-
-  using pointer_type = value_type*;
-
-  // The reference_type for an array is 'value_type *'
-  // The reference_type for a single value is 'value_type &'
-
-  using reference_type = std::conditional_t<IsArray, value_type*, value_type&>;
-
-  // Number of values if single value
-  template <class F>
-  KOKKOS_FORCEINLINE_FUNCTION static
-      typename std::enable_if<std::is_same<F, FunctorType>::value && !IsArray,
-                              unsigned>::type
-      value_count(const F&) {
-    return 1;
-  }
-
-  // Number of values if an array, protect via templating because
-  // 'f.value_count' will only exist when the functor declares the value_type to
-  // be an array.
-  template <class F>
-  KOKKOS_FORCEINLINE_FUNCTION static
-      typename std::enable_if<std::is_same<F, FunctorType>::value && IsArray,
-                              unsigned>::type
-      value_count(const F& f) {
-    return f.value_count;
-  }
-
-  // Total size of the value
-  KOKKOS_INLINE_FUNCTION static unsigned value_size(const FunctorType& f) {
-    return value_count(f) * sizeof(value_type);
-  }
-};
-
-template <class FunctorType, class ArgTag>
-struct FunctorValueTraits<FunctorType, ArgTag,
-                          false /* == exists FunctorType::value_type */
-                          > {
- private:
-  struct VOIDTAG {
-  };  // Allow declaration of non-matching operator() with void argument tag.
-  struct REJECTTAG {
-  };  // Reject tagged operator() when using non-tagged execution policy.
-
-  using tag_type =
-      std::conditional_t<std::is_same<ArgTag, void>::value, VOIDTAG, ArgTag>;
-
-  //----------------------------------------
-  // parallel_for operator without a tag:
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember)
-                   const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember)
-                   const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                                     const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                                     const ArgMember&, const ArgMember&,
-                                     const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                                     const ArgMember&, const ArgMember&,
-                                     const ArgMember&, const ArgMember&,
-                                     const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&,
-                                     const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&,
-                                     const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class TagType, class ArgMember>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  //----------------------------------------
-  // parallel_for operator with a tag:
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember,
-                                      ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, const ArgMember&,
-                                      const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&,
-                                      const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&) const) {}
-
-  template <class ArgMember>
-  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&) const) {}
-
-  //----------------------------------------
-  // parallel_reduce operator without a tag:
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(ArgMember, ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember,
-                                     T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                                     const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                                     const ArgMember&, const ArgMember&,
-                                     const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                                     const ArgMember&, const ArgMember&,
-                                     const ArgMember&, const ArgMember&,
-                                     const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, T&) const) {
-  }
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&,
-                                     const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, ArgMember, ArgMember,
-                                     ArgMember, ArgMember, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&,
-                                     const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  //----------------------------------------
-  // parallel_reduce operator with a tag:
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember,
-                                      ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, ArgMember,
-                                      ArgMember, ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember,
-                            ArgMember, ArgMember, ArgMember, ArgMember,
-                            ArgMember, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, const ArgMember&,
-                                      const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&,
-                                      const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, T&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&,
-                            const ArgMember&, const ArgMember&, T&) const) {}
-
-  //----------------------------------------
-  // parallel_scan operator without a tag:
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, T&, bool) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, T&, bool) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, T&, bool) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, const ArgMember&, T&, bool) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(const TagType&, ArgMember, T&, bool) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&, T&, bool)
-                   const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(ArgMember, T&, const bool&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const ArgMember&, T&, const bool&) const) {
-  }
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG,
-      void (FunctorType::*)(TagType, ArgMember, T&, const bool&) const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&, T&, const bool&)
-                   const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, T&, const bool&)
-                   const) {}
-
-  template <class TagType, class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
-      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&, T&,
-                                     const bool&) const) {}
-  //----------------------------------------
-  // parallel_scan operator with a tag:
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, ArgMember, T&, bool) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(const tag_type&, ArgMember, T&, bool) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, const ArgMember&, T&, bool) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&, T&,
-                                      bool) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type,
-      void (FunctorType::*)(tag_type, ArgMember, T&, const bool&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, T&,
-                                      const bool&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(tag_type, const ArgMember&, T&,
-                                      const bool&) const) {}
-
-  template <class ArgMember, class T>
-  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
-      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&, T&,
-                                      const bool&) const) {}
-  //----------------------------------------
-
-  using ValueType =
-      decltype(deduce_reduce_type(tag_type(), &FunctorType::operator()));
-
-  enum { IS_VOID = std::is_same<VOIDTAG, ValueType>::value };
-  enum { IS_REJECT = std::is_same<REJECTTAG, ValueType>::value };
-
- public:
-  using value_type = std::conditional_t<IS_VOID || IS_REJECT, void, ValueType>;
-  using pointer_type =
-      std::conditional_t<IS_VOID || IS_REJECT, void, ValueType*>;
-  using reference_type =
-      std::conditional_t<IS_VOID || IS_REJECT, void, ValueType&>;
-  using functor_type = FunctorType;
-
-  static_assert(
-      IS_VOID || IS_REJECT ||
-          ((sizeof(ValueType) > sizeof(int))
-               ? 0 == sizeof(ValueType) % sizeof(int)
-               : true),
-      "Reduction functor's value_type deduced from functor::operator() "
-      "requires: 0 == sizeof(value_type) % sizeof(int)");
-
-  enum { StaticValueSize = IS_VOID || IS_REJECT ? 0 : sizeof(ValueType) };
-
-  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_size(const FunctorType&) {
-    return StaticValueSize;
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_count(const FunctorType&) {
-    return IS_VOID || IS_REJECT ? 0 : 1;
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** Function signatures for FunctorType::init function with a tag.
- *  reference_type is 'value_type &' for scalar and 'value_type *' for array.
- */
-template <class FunctorType, class ArgTag>
-struct FunctorValueInitFunction {
-  using reference_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::reference_type;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, reference_type) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, reference_type) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag,
-                                                        reference_type));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        reference_type));
-};
-
-/** Function signatures for FunctorType::init function without a tag.
- *  reference_type is 'value_type &' for scalar and 'value_type *' for array.
- */
-template <class FunctorType>
-struct FunctorValueInitFunction<FunctorType, void> {
-  using reference_type =
-      typename FunctorValueTraits<FunctorType, void>::reference_type;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(reference_type) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(reference_type));
-};
-
-// Adapter for value initialization function.
-// If a proper FunctorType::init is declared then use it,
-// otherwise use default constructor.
-template <class FunctorType, class ArgTag,
-          class T = typename FunctorValueTraits<FunctorType, ArgTag>::
-              reference_type  // FIXME Fix FunctorValueTraits for multi-dim
-                              // operator
-          ,
-          class Enable = void>
-struct FunctorValueInit;
-
-/* No 'init' function provided for single value */
-template <class FunctorType, class ArgTag, class T, class Enable>
-struct FunctorValueInit<FunctorType, ArgTag, T&, Enable> {
-  KOKKOS_FORCEINLINE_FUNCTION static T& init(const FunctorType&, void* p) {
-    return *(new (p) T());
-  };
-};
-
-/* No 'init' function provided for array value */
-template <class FunctorType, class ArgTag, class T, class Enable>
-struct FunctorValueInit<FunctorType, ArgTag, T*, Enable> {
-  KOKKOS_FORCEINLINE_FUNCTION static T* init(const FunctorType& f, void* p) {
-    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f);
-    for (int i = 0; i < n; ++i) {
-      new (((T*)p) + i) T();
-    }
-    return (T*)p;
-  }
-};
-
-/* 'init' function provided for single value */
-template <class FunctorType, class T>
-struct FunctorValueInit<
-    FunctorType, void,
-    T&
-    // First  substitution failure when FunctorType::init does not exist.
-    // Second substitution failure when FunctorType::init is not compatible.
-    ,
-    decltype(FunctorValueInitFunction<FunctorType, void>::enable_if(
-        &FunctorType::init))> {
-  KOKKOS_FORCEINLINE_FUNCTION static T& init(const FunctorType& f, void* p) {
-    f.init(*((T*)p));
-    return *((T*)p);
-  }
-};
-
-/* 'init' function provided for array value */
-template <class FunctorType, class T>
-struct FunctorValueInit<
-    FunctorType, void,
-    T*
-    // First  substitution failure when FunctorType::init does not exist.
-    // Second substitution failure when FunctorType::init is not compatible
-    ,
-    decltype(FunctorValueInitFunction<FunctorType, void>::enable_if(
-        &FunctorType::init))> {
-  KOKKOS_FORCEINLINE_FUNCTION static T* init(const FunctorType& f, void* p) {
-    f.init((T*)p);
-    return (T*)p;
-  }
-};
-
-/* 'init' function provided for single value */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorValueInit<
-    FunctorType, ArgTag,
-    T&
-    // First  substitution failure when FunctorType::init does not exist.
-    // Second substitution failure when FunctorType::init is not compatible.
-    ,
-    typename std::enable_if<
-        !std::is_same<ArgTag, void>::value,
-        decltype(FunctorValueInitFunction<FunctorType, ArgTag>::enable_if(
-            &FunctorType::init))>::type> {
-  KOKKOS_FORCEINLINE_FUNCTION static T& init(const FunctorType& f, void* p) {
-    f.init(ArgTag(), *((T*)p));
-    return *((T*)p);
-  }
-};
-
-/* 'init' function provided for array value */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorValueInit<
-    FunctorType, ArgTag,
-    T*
-    // First  substitution failure when FunctorType::init does not exist.
-    // Second substitution failure when FunctorType::init is not compatible
-    ,
-    typename std::enable_if<
-        !std::is_same<ArgTag, void>::value,
-        decltype(FunctorValueInitFunction<FunctorType, ArgTag>::enable_if(
-            &FunctorType::init))>::type> {
-  KOKKOS_FORCEINLINE_FUNCTION static T* init(const FunctorType& f, void* p) {
-    f.init(ArgTag(), (T*)p);
-    return (T*)p;
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Signatures for compatible FunctorType::join with tag and not an array
-template <class FunctorType, class ArgTag,
-          bool IsArray =
-              0 == FunctorValueTraits<FunctorType, ArgTag>::StaticValueSize>
-struct FunctorValueJoinFunction {
-  using value_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;
-
-  using vref_type  = volatile value_type&;
-  using cvref_type = const volatile value_type&;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, vref_type, cvref_type) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, vref_type, cvref_type) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, vref_type,
-                                                        cvref_type));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        vref_type, cvref_type));
-};
-
-// Signatures for compatible FunctorType::join with tag and is an array
-template <class FunctorType, class ArgTag>
-struct FunctorValueJoinFunction<FunctorType, ArgTag, true> {
-  using value_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;
-
-  using vptr_type  = volatile value_type*;
-  using cvptr_type = const volatile value_type*;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, vptr_type, cvptr_type) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, vptr_type, cvptr_type) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, vptr_type,
-                                                        cvptr_type));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        vptr_type, cvptr_type));
-};
-
-// Signatures for compatible FunctorType::join without tag and not an array
-template <class FunctorType>
-struct FunctorValueJoinFunction<FunctorType, void, false> {
-  using value_type = typename FunctorValueTraits<FunctorType, void>::value_type;
-
-  using vref_type  = volatile value_type&;
-  using cvref_type = const volatile value_type&;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)(vref_type,
-                                                                     cvref_type)
-                                                   const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vref_type, cvref_type));
-};
-
-// Signatures for compatible FunctorType::join without tag and is an array
-template <class FunctorType>
-struct FunctorValueJoinFunction<FunctorType, void, true> {
-  using value_type = typename FunctorValueTraits<FunctorType, void>::value_type;
-
-  using vptr_type  = volatile value_type*;
-  using cvptr_type = const volatile value_type*;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)(vptr_type,
-                                                                     cvptr_type)
-                                                   const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vptr_type, cvptr_type));
-};
-
-template <class FunctorType, class ArgTag,
-          class T =
-              typename FunctorValueTraits<FunctorType, ArgTag>::reference_type,
-          class Enable = void>
-struct FunctorValueJoin;
-
-/* No 'join' function provided, single value */
-template <class FunctorType, class ArgTag, class T, class Enable>
-struct FunctorValueJoin<FunctorType, ArgTag, T&, Enable> {
-  KOKKOS_FORCEINLINE_FUNCTION
-  FunctorValueJoin(const FunctorType&) {}
-
-  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& /*f*/,
-                                               volatile void* const lhs,
-                                               const volatile void* const rhs) {
-    *((volatile T*)lhs) += *((const volatile T*)rhs);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(volatile T& lhs, const volatile T& rhs) const { lhs += rhs; }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(T& lhs, const T& rhs) const { lhs += rhs; }
-};
-
-/* No 'join' function provided, array of values */
-template <class FunctorType, class ArgTag, class T, class Enable>
-struct FunctorValueJoin<FunctorType, ArgTag, T*, Enable> {
-  const FunctorType& f;
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  FunctorValueJoin(const FunctorType& f_) : f(f_) {}
-
-  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
-                                               volatile void* const lhs,
-                                               const volatile void* const rhs) {
-    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f_);
-
-    for (int i = 0; i < n; ++i) {
-      ((volatile T*)lhs)[i] += ((const volatile T*)rhs)[i];
-    }
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(volatile T* const lhs, const volatile T* const rhs) const {
-    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f);
-
-    for (int i = 0; i < n; ++i) {
-      lhs[i] += rhs[i];
-    }
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(T* lhs, const T* rhs) const {
-    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f);
-
-    for (int i = 0; i < n; ++i) {
-      lhs[i] += rhs[i];
-    }
-  }
-};
-
-/* 'join' function provided, single value */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorValueJoin<
-    FunctorType, ArgTag,
-    T&
-    // First  substitution failure when FunctorType::join does not exist.
-    // Second substitution failure when enable_if( & Functor::join ) does not
-    // exist
-    ,
-    decltype(FunctorValueJoinFunction<FunctorType, ArgTag>::enable_if(
-        &FunctorType::join))> {
-  const FunctorType& f;
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  FunctorValueJoin(const FunctorType& f_) : f(f_) {}
-
-  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
-                                               volatile void* const lhs,
-                                               const volatile void* const rhs) {
-    f_.join(ArgTag(), *((volatile T*)lhs), *((const volatile T*)rhs));
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(volatile T& lhs, const volatile T& rhs) const {
-    f.join(ArgTag(), lhs, rhs);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(T& lhs, const T& rhs) const { f.join(ArgTag(), lhs, rhs); }
-};
-
-/* 'join' function provided, no tag, single value */
-template <class FunctorType, class T>
-struct FunctorValueJoin<
-    FunctorType, void,
-    T&
-    // First  substitution failure when FunctorType::join does not exist.
-    // Second substitution failure when enable_if( & Functor::join ) does not
-    // exist
-    ,
-    decltype(FunctorValueJoinFunction<FunctorType, void>::enable_if(
-        &FunctorType::join))> {
-  const FunctorType& f;
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  FunctorValueJoin(const FunctorType& f_) : f(f_) {}
-
-  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
-                                               volatile void* const lhs,
-                                               const volatile void* const rhs) {
-    f_.join(*((volatile T*)lhs), *((const volatile T*)rhs));
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(volatile T& lhs, const volatile T& rhs) const {
-    f.join(lhs, rhs);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(T& lhs, const T& rhs) const { f.join(lhs, rhs); }
-};
-
-/* 'join' function provided for array value */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorValueJoin<
-    FunctorType, ArgTag,
-    T*
-    // First  substitution failure when FunctorType::join does not exist.
-    // Second substitution failure when enable_if( & Functor::join ) does not
-    // exist
-    ,
-    decltype(FunctorValueJoinFunction<FunctorType, ArgTag>::enable_if(
-        &FunctorType::join))> {
-  const FunctorType& f;
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  FunctorValueJoin(const FunctorType& f_) : f(f_) {}
-
-  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
-                                               volatile void* const lhs,
-                                               const volatile void* const rhs) {
-    f_.join(ArgTag(), (volatile T*)lhs, (const volatile T*)rhs);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(volatile T* const lhs, const volatile T* const rhs) const {
-    f.join(ArgTag(), lhs, rhs);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(T* lhs, const T* rhs) const { f.join(ArgTag(), lhs, rhs); }
-};
-
-/* 'join' function provided, no tag, array value */
-template <class FunctorType, class T>
-struct FunctorValueJoin<
-    FunctorType, void,
-    T*
-    // First  substitution failure when FunctorType::join does not exist.
-    // Second substitution failure when enable_if( & Functor::join ) does not
-    // exist
-    ,
-    decltype(FunctorValueJoinFunction<FunctorType, void>::enable_if(
-        &FunctorType::join))> {
-  const FunctorType& f;
-
-  KOKKOS_FORCEINLINE_FUNCTION
-  FunctorValueJoin(const FunctorType& f_) : f(f_) {}
-
-  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
-                                               volatile void* const lhs,
-                                               const volatile void* const rhs) {
-    f_.join((volatile T*)lhs, (const volatile T*)rhs);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(volatile T* const lhs, const volatile T* const rhs) const {
-    f.join(lhs, rhs);
-  }
-  KOKKOS_FORCEINLINE_FUNCTION
-  void operator()(T* lhs, const T* rhs) const { f.join(lhs, rhs); }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-namespace Impl {
-
-template <typename ValueType, class JoinOp, class Enable = void>
-struct JoinLambdaAdapter {
-  using value_type = ValueType;
-  const JoinOp& lambda;
-  KOKKOS_INLINE_FUNCTION
-  JoinLambdaAdapter(const JoinOp& lambda_) : lambda(lambda_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
-    lambda(dst, src);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(value_type& dst, const value_type& src) const { lambda(dst, src); }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(volatile value_type& dst,
-                  const volatile value_type& src) const {
-    lambda(dst, src);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(value_type& dst, const value_type& src) const {
-    lambda(dst, src);
-  }
-};
-
-template <typename ValueType, class JoinOp>
-struct JoinLambdaAdapter<ValueType, JoinOp,
-                         decltype(FunctorValueJoinFunction<
-                                  JoinOp, void>::enable_if(&JoinOp::join))> {
-  using value_type = ValueType;
-  static_assert(
-      std::is_same<ValueType, typename JoinOp::value_type>::value,
-      "JoinLambdaAdapter static_assert Fail: ValueType != JoinOp::value_type");
-
-  const JoinOp& lambda;
-  KOKKOS_INLINE_FUNCTION
-  JoinLambdaAdapter(const JoinOp& lambda_) : lambda(lambda_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
-    lambda.join(dst, src);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(value_type& dst, const value_type& src) const {
-    lambda.join(dst, src);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(volatile value_type& dst,
-                  const volatile value_type& src) const {
-    lambda.join(dst, src);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(value_type& dst, const value_type& src) const {
-    lambda.join(dst, src);
-  }
-};
-
-template <typename ValueType>
-struct JoinAdd {
-  using value_type = ValueType;
-
-  KOKKOS_DEFAULTED_FUNCTION
-  JoinAdd() = default;
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
-    dst += src;
-  }
-  KOKKOS_INLINE_FUNCTION
-  void operator()(value_type& dst, const value_type& src) const { dst += src; }
-  KOKKOS_INLINE_FUNCTION
-  void operator()(volatile value_type& dst,
-                  const volatile value_type& src) const {
-    dst += src;
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class ArgTag,
-          class T =
-              typename FunctorValueTraits<FunctorType, ArgTag>::reference_type>
-struct FunctorValueOps;
-
-template <class FunctorType, class ArgTag, class T>
-struct FunctorValueOps<FunctorType, ArgTag, T&> {
-  KOKKOS_FORCEINLINE_FUNCTION static T* pointer(T& r) { return &r; }
-
-  KOKKOS_FORCEINLINE_FUNCTION static T& reference(void* p) { return *((T*)p); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void copy(const FunctorType&,
-                                               void* const lhs,
-                                               const void* const rhs) {
-    *((T*)lhs) = *((const T*)rhs);
-  }
-};
-
-/* No 'join' function provided, array of values */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorValueOps<FunctorType, ArgTag, T*> {
-  KOKKOS_FORCEINLINE_FUNCTION static T* pointer(T* p) { return p; }
-
-  KOKKOS_FORCEINLINE_FUNCTION static T* reference(void* p) { return ((T*)p); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void copy(const FunctorType& f,
-                                               void* const lhs,
-                                               const void* const rhs) {
-    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f);
-    for (int i = 0; i < n; ++i) {
-      ((T*)lhs)[i] = ((const T*)rhs)[i];
-    }
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-// Compatible functions for 'final' function and value_type not an array
-template <class FunctorType, class ArgTag,
-          bool IsArray =
-              0 == FunctorValueTraits<FunctorType, ArgTag>::StaticValueSize>
-struct FunctorFinalFunction {
-  using value_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type&) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type&) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, value_type&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        value_type&));
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type const&) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type const&) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type const&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type const&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag,
-                                                        value_type const&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        value_type const&));
-};
-
-// Compatible functions for 'final' function and value_type is an array
-template <class FunctorType, class ArgTag>
-struct FunctorFinalFunction<FunctorType, ArgTag, true> {
-  using value_type =
-      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type*) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type*) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, value_type*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        value_type*));
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type const*) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type const*) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, value_type const*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, value_type const*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag,
-                                                        value_type const*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        value_type const*));
-};
-
-template <class FunctorType>
-struct FunctorFinalFunction<FunctorType, void, false> {
-  using value_type = typename FunctorValueTraits<FunctorType, void>::value_type;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(value_type&) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(value_type&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(value_type&));
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(const value_type&) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(const value_type&));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(const value_type&));
-};
-
-template <class FunctorType>
-struct FunctorFinalFunction<FunctorType, void, true> {
-  using value_type = typename FunctorValueTraits<FunctorType, void>::value_type;
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(value_type*) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(value_type*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(value_type*));
-
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(const value_type*) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(const value_type*));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(const value_type*));
-};
-
-/* No 'final' function provided */
-template <class FunctorType, class ArgTag,
-          class ResultType =
-              typename FunctorValueTraits<FunctorType, ArgTag>::reference_type,
-          class Enable = void>
-struct FunctorFinal {
-  KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType&, void*) {}
-};
-
-/* 'final' function provided for single value but no tag*/
-template <class FunctorType, class ArgTag, class T>
-struct FunctorFinal<
-    FunctorType, ArgTag,
-    T&
-    // First  substitution failure when FunctorType::final does not exist.
-    // Second substitution failure when FunctorType::final is not compatible.
-    ,
-    typename std::enable_if<
-        std::is_same<ArgTag, void>::value,
-        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
-            &FunctorType::final))>::type> {
-  KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
-    f.final(*((T*)p));
-  }
-};
-
-/* 'final' function provided for array value but no tag*/
-template <class FunctorType, class ArgTag, class T>
-struct FunctorFinal<
-    FunctorType, ArgTag,
-    T*
-    // First  substitution failure when FunctorType::final does not exist.
-    // Second substitution failure when FunctorType::final is not compatible.
-    ,
-    typename std::enable_if<
-        std::is_same<ArgTag, void>::value,
-        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
-            &FunctorType::final))>::type> {
-  KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
-    f.final((T*)p);
-  }
-};
-
-/* 'final' function provided for single value and with tag */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorFinal<
-    FunctorType, ArgTag,
-    T&
-    // First  substitution failure when FunctorType::final does not exist.
-    // Second substitution failure when FunctorType::final is not compatible.
-    ,
-    typename std::enable_if<
-        !std::is_same<ArgTag, void>::value,
-        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
-            &FunctorType::final))>::type> {
-  KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
-    f.final(ArgTag(), *((T*)p));
-  }
-};
-
-/* 'final' function provided for array value and with tag */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorFinal<
-    FunctorType, ArgTag,
-    T*
-    // First  substitution failure when FunctorType::final does not exist.
-    // Second substitution failure when FunctorType::final is not compatible.
-    ,
-    typename std::enable_if<
-        !std::is_same<ArgTag, void>::value,
-        decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if(
-            &FunctorType::final))>::type> {
-  KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) {
-    f.final(ArgTag(), (T*)p);
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* KOKKOS_FUNCTORADAPTER_HPP */
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
index 7140154e0..6569e4901 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -61,6 +61,35 @@ struct FunctorPatternInterface {
   struct SCAN {};
 };
 
+template <typename T>
+struct DeduceFunctorPatternInterface;
+
+template <class FunctorType, class ExecPolicy, class ExecutionSpace>
+struct DeduceFunctorPatternInterface<
+    ParallelFor<FunctorType, ExecPolicy, ExecutionSpace>> {
+  using type = FunctorPatternInterface::FOR;
+};
+
+template <class FunctorType, class ExecPolicy, class ReducerType,
+          class ExecutionSpace>
+struct DeduceFunctorPatternInterface<
+    ParallelReduce<FunctorType, ExecPolicy, ReducerType, ExecutionSpace>> {
+  using type = FunctorPatternInterface::REDUCE;
+};
+
+template <class FunctorType, class ExecPolicy, class ExecutionSpace>
+struct DeduceFunctorPatternInterface<
+    ParallelScan<FunctorType, ExecPolicy, ExecutionSpace>> {
+  using type = FunctorPatternInterface::SCAN;
+};
+
+template <class FunctorType, class ExecPolicy, class ReturnType,
+          class ExecutionSpace>
+struct DeduceFunctorPatternInterface<ParallelScanWithTotal<
+    FunctorType, ExecPolicy, ReturnType, ExecutionSpace>> {
+  using type = FunctorPatternInterface::SCAN;
+};
+
 /** \brief  Query Functor and execution policy argument tag for value type.
  *
  *  If 'value_type' is not explicitly declared in the functor
@@ -79,17 +108,16 @@ struct FunctorAnalysis {
 
   //----------------------------------------
 
-  struct VOID {};
+  struct void_tag {};
 
   template <typename P = Policy, typename = std::false_type>
   struct has_work_tag {
     using type = void;
-    using wtag = VOID;
+    using wtag = void_tag;
   };
 
   template <typename P>
-  struct has_work_tag<P,
-                      typename std::is_same<typename P::work_tag, void>::type> {
+  struct has_work_tag<P, typename std::is_void<typename P::work_tag>::type> {
     using type = typename P::work_tag;
     using wtag = typename P::work_tag;
   };
@@ -108,7 +136,7 @@ struct FunctorAnalysis {
 
   template <typename T>
   struct has_execution_space<
-      T, typename std::is_same<typename T::execution_space, void>::type> {
+      T, typename std::is_void<typename T::execution_space>::type> {
     using type = typename T::execution_space;
     enum : bool { value = true };
   };
@@ -130,8 +158,8 @@ struct FunctorAnalysis {
   };
 
   template <typename F>
-  struct has_value_type<
-      F, typename std::is_same<typename F::value_type, void>::type> {
+  struct has_value_type<F,
+                        typename std::is_void<typename F::value_type>::type> {
     using type = typename F::value_type;
 
     static_assert(!std::is_reference<type>::value &&
@@ -147,7 +175,7 @@ struct FunctorAnalysis {
 
   template <typename F, typename P = PatternInterface,
             typename V = typename has_value_type<F>::type,
-            bool T     = std::is_same<Tag, void>::value>
+            bool T     = std::is_void<Tag>::value>
   struct deduce_value_type {
     using type = V;
   };
@@ -288,50 +316,46 @@ struct FunctorAnalysis {
   using candidate_type = typename deduce_value_type<Functor>::type;
 
   enum {
-    candidate_is_void  = std::is_same<candidate_type, void>::value,
+    candidate_is_void  = std::is_void<candidate_type>::value,
     candidate_is_array = std::rank<candidate_type>::value == 1
   };
 
   //----------------------------------------
 
  public:
-  using execution_space = typename std::conditional<
-      functor_has_space::value, typename functor_has_space::type,
-      typename std::conditional<policy_has_space::value,
-                                typename policy_has_space::type,
-                                Kokkos::DefaultExecutionSpace>::type>::type;
+  using execution_space =
+      std::conditional_t<functor_has_space::value,
+                         typename functor_has_space::type,
+                         std::conditional_t<policy_has_space::value,
+                                            typename policy_has_space::type,
+                                            Kokkos::DefaultExecutionSpace>>;
 
-  using value_type = typename std::remove_extent<candidate_type>::type;
+  using value_type = std::remove_extent_t<candidate_type>;
 
   static_assert(!std::is_const<value_type>::value,
                 "Kokkos functor operator reduce argument cannot be const");
 
  private:
   // Stub to avoid defining a type 'void &'
-  using ValueType =
-      typename std::conditional<candidate_is_void, VOID, value_type>::type;
+  using ValueType = std::conditional_t<candidate_is_void, void_tag, value_type>;
 
  public:
-  using pointer_type =
-      typename std::conditional<candidate_is_void, void, ValueType*>::type;
+  using pointer_type = std::conditional_t<candidate_is_void, void, ValueType*>;
 
-  using reference_type = typename std::conditional<
+  using reference_type = std::conditional_t<
       candidate_is_array, ValueType*,
-      typename std::conditional<!candidate_is_void, ValueType&,
-                                void>::type>::type;
+      std::conditional_t<!candidate_is_void, ValueType&, void>>;
 
  private:
   template <bool IsArray, class FF>
-  KOKKOS_INLINE_FUNCTION static constexpr
-      typename std::enable_if<IsArray, unsigned>::type
-      get_length(FF const& f) {
+  KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t<IsArray, unsigned>
+  get_length(FF const& f) {
     return f.value_count;
   }
 
   template <bool IsArray, class FF>
-  KOKKOS_INLINE_FUNCTION static constexpr
-      typename std::enable_if<!IsArray, unsigned>::type
-      get_length(FF const&) {
+  KOKKOS_INLINE_FUNCTION static constexpr std::enable_if_t<!IsArray, unsigned>
+  get_length(FF const&) {
     return candidate_is_void ? 0 : 1;
   }
 
@@ -367,29 +391,52 @@ struct FunctorAnalysis {
   }
 
  private:
-  enum INTERFACE : int {
-    DISABLE           = 0,
-    NO_TAG_NOT_ARRAY  = 1,
-    NO_TAG_IS_ARRAY   = 2,
-    HAS_TAG_NOT_ARRAY = 3,
-    HAS_TAG_IS_ARRAY  = 4,
-    DEDUCED =
-        !std::is_same<PatternInterface, REDUCE>::value
-            ? DISABLE
-            : (std::is_same<Tag, void>::value
-                   ? (candidate_is_array ? NO_TAG_IS_ARRAY : NO_TAG_NOT_ARRAY)
-                   : (candidate_is_array ? HAS_TAG_IS_ARRAY
-                                         : HAS_TAG_NOT_ARRAY))
-  };
-
   //----------------------------------------
   // parallel_reduce join operator
 
-  template <class F, INTERFACE>
-  struct has_join_function;
+  template <class F, bool is_array = candidate_is_array>
+  struct has_join_no_tag_function;
+
+  template <class F>
+  struct has_join_no_tag_function<F, /*is_array*/ false> {
+    using ref_type  = ValueType&;
+    using cref_type = const ValueType&;
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ref_type,
+                                                             cref_type) const);
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ref_type, cref_type));
+
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
+      f->join(*dst, *src);
+    }
+  };
+
+  template <class F>
+  struct has_join_no_tag_function<F, /*is_array*/ true> {
+    using ref_type  = ValueType*;
+    using cref_type = const ValueType*;
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ref_type,
+                                                             cref_type) const);
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ref_type, cref_type));
+
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
+      f->join(dst, src);
+    }
+  };
+
+  template <class F, bool is_array = candidate_is_array>
+  struct has_volatile_join_no_tag_function;
 
   template <class F>
-  struct has_join_function<F, NO_TAG_NOT_ARRAY> {
+  struct KOKKOS_DEPRECATED_WITH_COMMENT(
+      "Reduce/scan join() taking `volatile`-qualified parameters is "
+      "deprecated. Remove the `volatile` qualifier.")
+      has_volatile_join_no_tag_function<F, /*is_array*/ false> {
     using vref_type  = volatile ValueType&;
     using cvref_type = const volatile ValueType&;
 
@@ -399,15 +446,17 @@ struct FunctorAnalysis {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vref_type,
                                                           cvref_type));
 
-    KOKKOS_INLINE_FUNCTION static void join(F const* const f,
-                                            ValueType volatile* dst,
-                                            ValueType volatile const* src) {
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
       f->join(*dst, *src);
     }
   };
 
   template <class F>
-  struct has_join_function<F, NO_TAG_IS_ARRAY> {
+  struct KOKKOS_DEPRECATED_WITH_COMMENT(
+      "Reduce/scan join() taking `volatile`-qualified parameters is "
+      "deprecated. Remove the `volatile` qualifier.")
+      has_volatile_join_no_tag_function<F, /*is_array*/ true> {
     using vref_type  = volatile ValueType*;
     using cvref_type = const volatile ValueType*;
 
@@ -417,15 +466,71 @@ struct FunctorAnalysis {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vref_type,
                                                           cvref_type));
 
-    KOKKOS_INLINE_FUNCTION static void join(F const* const f,
-                                            ValueType volatile* dst,
-                                            ValueType volatile const* src) {
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
       f->join(dst, src);
     }
   };
 
+  template <class F, bool is_array = candidate_is_array>
+  struct has_join_tag_function;
+
   template <class F>
-  struct has_join_function<F, HAS_TAG_NOT_ARRAY> {
+  struct has_join_tag_function<F, /*is_array*/ false> {
+    using ref_type  = ValueType&;
+    using cref_type = const ValueType&;
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ref_type,
+                                                             cref_type) const);
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, ref_type,
+                                                          cref_type));
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&,
+                                                             ref_type,
+                                                             cref_type) const);
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, ref_type,
+                                                          cref_type));
+
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
+      f->join(WTag(), *dst, *src);
+    }
+  };
+
+  template <class F>
+  struct has_join_tag_function<F, /*is_array*/ true> {
+    using ref_type  = ValueType*;
+    using cref_type = const ValueType*;
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ref_type,
+                                                             cref_type) const);
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, ref_type,
+                                                          cref_type));
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&,
+                                                             ref_type,
+                                                             cref_type) const);
+
+    KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, ref_type,
+                                                          cref_type));
+
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
+      f->join(WTag(), dst, src);
+    }
+  };
+
+  template <class F, bool is_array = candidate_is_array>
+  struct has_volatile_join_tag_function;
+
+  template <class F>
+  struct KOKKOS_DEPRECATED_WITH_COMMENT(
+      "Reduce/scan join() taking `volatile`-qualified parameters is "
+      "deprecated. Remove the `volatile` qualifier.")
+      has_volatile_join_tag_function<F, /*is_array*/ false> {
     using vref_type  = volatile ValueType&;
     using cvref_type = const volatile ValueType&;
 
@@ -443,15 +548,17 @@ struct FunctorAnalysis {
                                                           vref_type,
                                                           cvref_type));
 
-    KOKKOS_INLINE_FUNCTION static void join(F const* const f,
-                                            ValueType volatile* dst,
-                                            ValueType volatile const* src) {
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
       f->join(WTag(), *dst, *src);
     }
   };
 
   template <class F>
-  struct has_join_function<F, HAS_TAG_IS_ARRAY> {
+  struct KOKKOS_DEPRECATED_WITH_COMMENT(
+      "Reduce/scan join() taking `volatile`-qualified parameters is "
+      "deprecated. Remove the `volatile` qualifier.")
+      has_volatile_join_tag_function<F, /*is_array*/ true> {
     using vref_type  = volatile ValueType*;
     using cvref_type = const volatile ValueType*;
 
@@ -469,47 +576,112 @@ struct FunctorAnalysis {
                                                           vref_type,
                                                           cvref_type));
 
-    KOKKOS_INLINE_FUNCTION static void join(F const* const f,
-                                            ValueType volatile* dst,
-                                            ValueType volatile const* src) {
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
       f->join(WTag(), dst, src);
     }
   };
 
-  template <class F = Functor, INTERFACE = DEDUCED, typename = void>
-  struct DeduceJoin {
+  template <class F, class = void>
+  struct detected_join_no_tag {
     enum : bool { value = false };
+  };
 
-    KOKKOS_INLINE_FUNCTION static void join(F const* const f,
-                                            ValueType volatile* dst,
-                                            ValueType volatile const* src) {
+  template <class F>
+  struct detected_join_no_tag<
+      F, decltype(has_join_no_tag_function<F>::enable_if(&F::join))> {
+    enum : bool { value = true };
+  };
+
+  template <class F, class = void>
+  struct detected_volatile_join_no_tag {
+    enum : bool { value = false };
+  };
+
+  template <class F>
+  struct detected_volatile_join_no_tag<
+      F, decltype(has_volatile_join_no_tag_function<F>::enable_if(&F::join))> {
+    enum : bool { value = true };
+  };
+
+  template <class F, class = void>
+  struct detected_join_tag {
+    enum : bool { value = false };
+  };
+
+  template <class F>
+  struct detected_join_tag<F, decltype(has_join_tag_function<F>::enable_if(
+                                  &F::join))> {
+    enum : bool { value = true };
+  };
+
+  template <class F, class = void>
+  struct detected_volatile_join_tag {
+    enum : bool { value = false };
+  };
+
+  template <class F>
+  struct detected_volatile_join_tag<
+      F, decltype(has_volatile_join_tag_function<F>::enable_if(&F::join))> {
+    enum : bool { value = true };
+  };
+
+  template <class F = Functor, typename = void>
+  struct DeduceJoinNoTag {
+    enum : bool { value = false };
+
+    KOKKOS_INLINE_FUNCTION static void join(F const* const f, ValueType* dst,
+                                            ValueType const* src) {
       const int n = FunctorAnalysis::value_count(*f);
       for (int i = 0; i < n; ++i) dst[i] += src[i];
     }
   };
 
   template <class F>
-  struct DeduceJoin<F, DISABLE, void> {
-    enum : bool { value = false };
+  struct DeduceJoinNoTag<F, std::enable_if_t<(is_reducer<F>::value ||
+                                              (!is_reducer<F>::value &&
+                                               std::is_void<Tag>::value)) &&
+                                             detected_join_no_tag<F>::value>>
+      : public has_join_no_tag_function<F> {
+    enum : bool { value = true };
+  };
 
-    KOKKOS_INLINE_FUNCTION static void join(F const* const, ValueType volatile*,
-                                            ValueType volatile const*) {}
+  template <class F>
+  struct DeduceJoinNoTag<
+      F,
+      std::enable_if_t<(is_reducer<F>::value ||
+                        (!is_reducer<F>::value && std::is_void<Tag>::value)) &&
+                       (!detected_join_no_tag<F>::value &&
+                        detected_volatile_join_no_tag<F>::value)>>
+      : public has_volatile_join_no_tag_function<F> {
+    enum : bool { value = true };
   };
 
-  template <class F, INTERFACE I>
-  struct DeduceJoin<F, I,
-                    decltype(has_join_function<F, I>::enable_if(&F::join))>
-      : public has_join_function<F, I> {
+  template <class F = Functor, typename = void>
+  struct DeduceJoin : public DeduceJoinNoTag<F> {};
+
+  template <class F>
+  struct DeduceJoin<
+      F, std::enable_if_t<!is_reducer<F>::value && detected_join_tag<F>::value>>
+      : public has_join_tag_function<F> {
+    enum : bool { value = true };
+  };
+
+  template <class F>
+  struct DeduceJoin<F, std::enable_if_t<!is_reducer<F>::value &&
+                                        (!detected_join_tag<F>::value &&
+                                         detected_volatile_join_tag<F>::value)>>
+      : public has_volatile_join_tag_function<F> {
     enum : bool { value = true };
   };
 
   //----------------------------------------
 
-  template <class, INTERFACE>
-  struct has_init_function;
+  template <class, bool is_array = candidate_is_array>
+  struct has_init_no_tag_function;
 
   template <class F>
-  struct has_init_function<F, NO_TAG_NOT_ARRAY> {
+  struct has_init_no_tag_function<F, /*is_array*/ false> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType&) const);
 
     KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType&));
@@ -520,7 +692,7 @@ struct FunctorAnalysis {
   };
 
   template <class F>
-  struct has_init_function<F, NO_TAG_IS_ARRAY> {
+  struct has_init_no_tag_function<F, /*is_array*/ true> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType*) const);
 
     KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType*));
@@ -530,8 +702,11 @@ struct FunctorAnalysis {
     }
   };
 
+  template <class, bool is_array = candidate_is_array>
+  struct has_init_tag_function;
+
   template <class F>
-  struct has_init_function<F, HAS_TAG_NOT_ARRAY> {
+  struct has_init_tag_function<F, /*is_array*/ false> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType&)
                                                      const);
 
@@ -549,7 +724,7 @@ struct FunctorAnalysis {
   };
 
   template <class F>
-  struct has_init_function<F, HAS_TAG_IS_ARRAY> {
+  struct has_init_tag_function<F, /*is_array*/ true> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType*)
                                                      const);
 
@@ -566,37 +741,46 @@ struct FunctorAnalysis {
     }
   };
 
-  template <class F = Functor, INTERFACE = DEDUCED, typename = void>
-  struct DeduceInit {
+  template <class F = Functor, typename = void>
+  struct DeduceInitNoTag {
     enum : bool { value = false };
 
-    KOKKOS_INLINE_FUNCTION static void init(F const* const, ValueType* dst) {
-      new (dst) ValueType();
+    KOKKOS_INLINE_FUNCTION static void init(F const* const f, ValueType* dst) {
+      const int n = FunctorAnalysis::value_count(*f);
+      for (int i = 0; i < n; ++i) new (&dst[i]) ValueType();
     }
   };
 
   template <class F>
-  struct DeduceInit<F, DISABLE, void> {
-    enum : bool { value = false };
-
-    KOKKOS_INLINE_FUNCTION static void init(F const* const, ValueType*) {}
+  struct DeduceInitNoTag<
+      F, std::enable_if_t<is_reducer<F>::value || (!is_reducer<F>::value &&
+                                                   std::is_void<Tag>::value),
+                          decltype(has_init_no_tag_function<F>::enable_if(
+                              &F::init))>>
+      : public has_init_no_tag_function<F> {
+    enum : bool { value = true };
   };
 
-  template <class F, INTERFACE I>
-  struct DeduceInit<F, I,
-                    decltype(has_init_function<F, I>::enable_if(&F::init))>
-      : public has_init_function<F, I> {
+  template <class F = Functor, typename = void>
+  struct DeduceInit : public DeduceInitNoTag<F> {};
+
+  template <class F>
+  struct DeduceInit<
+      F,
+      std::enable_if_t<!is_reducer<F>::value,
+                       decltype(has_init_tag_function<F>::enable_if(&F::init))>>
+      : public has_init_tag_function<F> {
     enum : bool { value = true };
   };
 
   //----------------------------------------
 
-  template <class, INTERFACE>
-  struct has_final_function;
+  template <class, bool is_array = candidate_is_array>
+  struct has_final_no_tag_function;
 
   // No tag, not array
   template <class F>
-  struct has_final_function<F, NO_TAG_NOT_ARRAY> {
+  struct has_final_no_tag_function<F, /*is_array*/ false> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType&) const);
 
     KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType&));
@@ -608,7 +792,7 @@ struct FunctorAnalysis {
 
   // No tag, is array
   template <class F>
-  struct has_final_function<F, NO_TAG_IS_ARRAY> {
+  struct has_final_no_tag_function<F, /*is_array*/ true> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType*) const);
 
     KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType*));
@@ -618,9 +802,12 @@ struct FunctorAnalysis {
     }
   };
 
+  template <class, bool is_array = candidate_is_array>
+  struct has_final_tag_function;
+
   // Has tag, not array
   template <class F>
-  struct has_final_function<F, HAS_TAG_NOT_ARRAY> {
+  struct has_final_tag_function<F, /*is_array*/ false> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType&)
                                                      const);
 
@@ -639,7 +826,7 @@ struct FunctorAnalysis {
 
   // Has tag, is array
   template <class F>
-  struct has_final_function<F, HAS_TAG_IS_ARRAY> {
+  struct has_final_tag_function<F, /*is_array*/ true> {
     KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType*)
                                                      const);
 
@@ -656,18 +843,32 @@ struct FunctorAnalysis {
     }
   };
 
-  template <class F = Functor, INTERFACE = DEDUCED, typename = void>
-  struct DeduceFinal {
+  template <class F = Functor, typename = void>
+  struct DeduceFinalNoTag {
     enum : bool { value = false };
 
     KOKKOS_INLINE_FUNCTION
     static void final(F const* const, ValueType*) {}
   };
 
-  template <class F, INTERFACE I>
-  struct DeduceFinal<F, I,
-                     decltype(has_final_function<F, I>::enable_if(&F::final))>
-      : public has_final_function<F, I> {
+  template <class F>
+  struct DeduceFinalNoTag<
+      F, std::enable_if_t<is_reducer<F>::value || (!is_reducer<F>::value &&
+                                                   std::is_void<Tag>::value),
+                          decltype(has_final_no_tag_function<F>::enable_if(
+                              &F::final))>>
+      : public has_final_no_tag_function<F> {
+    enum : bool { value = true };
+  };
+
+  template <class F = Functor, typename = void>
+  struct DeduceFinal : public DeduceFinalNoTag<F> {};
+
+  template <class F>
+  struct DeduceFinal<F, std::enable_if_t<!is_reducer<F>::value,
+                                         decltype(has_final_tag_function<
+                                                  F>::enable_if(&F::final))>>
+      : public has_final_tag_function<F> {
     enum : bool { value = true };
   };
 
@@ -681,8 +882,7 @@ struct FunctorAnalysis {
   };
 
   template <class F>
-  struct DeduceTeamShmem<
-      F, typename std::enable_if<0 < sizeof(&F::team_shmem_size)>::type> {
+  struct DeduceTeamShmem<F, std::enable_if_t<0 < sizeof(&F::team_shmem_size)>> {
     enum : bool { value = true };
 
     static size_t team_shmem_size(F const* const f, int team_size) {
@@ -691,8 +891,9 @@ struct FunctorAnalysis {
   };
 
   template <class F>
-  struct DeduceTeamShmem<
-      F, typename std::enable_if<0 < sizeof(&F::shmem_size)>::type> {
+  struct DeduceTeamShmem<F,
+                         std::enable_if_t<(0 < sizeof(&F::shmem_size)) &&
+                                          !(0 < sizeof(&F::team_shmem_size))>> {
     enum : bool { value = true };
 
     static size_t team_shmem_size(F const* const f, int team_size) {
@@ -713,54 +914,44 @@ struct FunctorAnalysis {
   enum { has_init_member_function = DeduceInit<>::value };
   enum { has_final_member_function = DeduceFinal<>::value };
 
-  template <class MemorySpace = typename execution_space::memory_space>
+  static_assert((Kokkos::is_reducer<Functor>::value &&
+                 has_join_member_function) ||
+                    !Kokkos::is_reducer<Functor>::value,
+                "Reducer must have a join member function!");
+
   struct Reducer {
    private:
     Functor const* const m_functor;
-    ValueType* const m_result;
-
-    template <bool IsArray>
-    KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<IsArray,
-                                typename FunctorAnalysis::ValueType*>::type
-        ref() const noexcept {
-      return m_result;
-    }
 
     template <bool IsArray>
-    KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<!IsArray,
-                                typename FunctorAnalysis::ValueType&>::type
-        ref() const noexcept {
-      return *m_result;
-    }
-
-    template <bool IsArray>
-    KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if<IsArray, int>::type
-    len() const noexcept {
+    KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<IsArray, int> len() const
+        noexcept {
       return m_functor->value_count;
     }
 
     template <bool IsArray>
-    KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<!IsArray, int>::type
-        len() const noexcept {
+    KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<!IsArray, int> len() const
+        noexcept {
       return candidate_is_void ? 0 : 1;
     }
 
    public:
     using reducer        = Reducer;
-    using value_type     = FunctorAnalysis::value_type;
-    using memory_space   = MemorySpace;
+    using value_type     = std::remove_const_t<FunctorAnalysis::value_type>;
+    using pointer_type   = value_type*;
     using reference_type = FunctorAnalysis::reference_type;
     using functor_type   = Functor;  // Adapts a functor
 
-    KOKKOS_INLINE_FUNCTION constexpr value_type* data() const noexcept {
-      return m_result;
+    template <bool is_array = candidate_is_array>
+    KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_array, reference_type>
+    reference(ValueType* dst) noexcept {
+      return dst;
     }
 
-    KOKKOS_INLINE_FUNCTION constexpr reference_type reference() const noexcept {
-      return Reducer::template ref<candidate_is_array>();
+    template <bool is_array = candidate_is_array>
+    KOKKOS_INLINE_FUNCTION static std::enable_if_t<!is_array, reference_type>
+    reference(ValueType* dst) noexcept {
+      return *dst;
     }
 
     KOKKOS_INLINE_FUNCTION constexpr int length() const noexcept {
@@ -774,14 +965,14 @@ struct FunctorAnalysis {
     }
 
     KOKKOS_INLINE_FUNCTION
-    void join(ValueType volatile* dst, ValueType volatile const* src) const
-        noexcept {
+    void join(ValueType* dst, ValueType const* src) const noexcept {
       DeduceJoin<>::join(m_functor, dst, src);
     }
 
-    KOKKOS_INLINE_FUNCTION
-    void init(ValueType* dst) const noexcept {
+    KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const
+        noexcept {
       DeduceInit<>::init(m_functor, dst);
+      return reference(dst);
     }
 
     KOKKOS_INLINE_FUNCTION
@@ -793,13 +984,11 @@ struct FunctorAnalysis {
     Reducer(Reducer&&)      = default;
     Reducer& operator=(Reducer const&) = delete;
     Reducer& operator=(Reducer&&) = delete;
-
-    template <class S>
-    using rebind = Reducer<S>;
+    ~Reducer()                    = default;
 
     KOKKOS_INLINE_FUNCTION explicit constexpr Reducer(
-        Functor const* arg_functor = 0, ValueType* arg_value = nullptr) noexcept
-        : m_functor(arg_functor), m_result(arg_value) {}
+        Functor const* arg_functor) noexcept
+        : m_functor(arg_functor) {}
   };
 };
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index 6fc649cfc..d533ec05c 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <cstddef>
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
index 4f93eebc0..9ad2dae55 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <impl/Kokkos_HostBarrier.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
index 1728fe90c..4bf904891 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <impl/Kokkos_Error.hpp>
@@ -50,7 +54,7 @@
 
 /*--------------------------------------------------------------------------*/
 
-#if defined(__INTEL_COMPILER) && !defined(KOKKOS_ENABLE_CUDA)
+#if defined(KOKKOS_COMPILER_INTEL) && !defined(KOKKOS_ENABLE_CUDA)
 
 // Intel specialized allocator does not interoperate with CUDA memory allocation
 
@@ -60,34 +64,6 @@
 
 /*--------------------------------------------------------------------------*/
 
-#if defined(KOKKOS_ENABLE_POSIX_MEMALIGN)
-
-#include <unistd.h>
-#include <sys/mman.h>
-
-/* mmap flags for private anonymous memory allocation */
-
-#if defined(MAP_ANONYMOUS) && defined(MAP_PRIVATE)
-#define KOKKOS_IMPL_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
-#elif defined(MAP_ANON) && defined(MAP_PRIVATE)
-#define KOKKOS_IMPL_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
-#endif
-
-// mmap flags for huge page tables
-// the Cuda driver does not interoperate with MAP_HUGETLB
-#if defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS)
-#if defined(MAP_HUGETLB) && !defined(KOKKOS_ENABLE_CUDA)
-#define KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE \
-  (KOKKOS_IMPL_POSIX_MMAP_FLAGS | MAP_HUGETLB)
-#else
-#define KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE KOKKOS_IMPL_POSIX_MMAP_FLAGS
-#endif
-#endif
-
-#endif
-
-/*--------------------------------------------------------------------------*/
-
 #include <cstddef>
 #include <cstdlib>
 #include <cstdint>
@@ -101,11 +77,6 @@
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
 
-#if (defined(KOKKOS_ENABLE_ASM) || defined(KOKKOS_ENABLE_TM)) && \
-    defined(KOKKOS_ENABLE_ISA_X86_64) && !defined(KOKKOS_COMPILER_PGI)
-#include <immintrin.h>
-#endif
-
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -116,10 +87,6 @@ HostSpace::HostSpace()
     : m_alloc_mech(
 #if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC)
           HostSpace::INTEL_MM_ALLOC
-#elif defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS)
-          HostSpace::POSIX_MMAP
-#elif defined(KOKKOS_ENABLE_POSIX_MEMALIGN)
-          HostSpace::POSIX_MEMALIGN
 #else
           HostSpace::STD_MALLOC
 #endif
@@ -136,23 +103,12 @@ HostSpace::HostSpace(const HostSpace::AllocationMechanism &arg_alloc_mech)
   else if (arg_alloc_mech == HostSpace::INTEL_MM_ALLOC) {
     m_alloc_mech = HostSpace::INTEL_MM_ALLOC;
   }
-#elif defined(KOKKOS_ENABLE_POSIX_MEMALIGN)
-  else if (arg_alloc_mech == HostSpace::POSIX_MEMALIGN) {
-    m_alloc_mech = HostSpace::POSIX_MEMALIGN;
-  }
-#elif defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS)
-  else if (arg_alloc_mech == HostSpace::POSIX_MMAP) {
-    m_alloc_mech = HostSpace::POSIX_MMAP;
-  }
 #endif
   else {
     const char *const mech =
         (arg_alloc_mech == HostSpace::INTEL_MM_ALLOC)
             ? "INTEL_MM_ALLOC"
-            : ((arg_alloc_mech == HostSpace::POSIX_MEMALIGN)
-                   ? "POSIX_MEMALIGN"
-                   : ((arg_alloc_mech == HostSpace::POSIX_MMAP) ? "POSIX_MMAP"
-                                                                : ""));
+            : ((arg_alloc_mech == HostSpace::POSIX_MMAP) ? "POSIX_MMAP" : "");
 
     std::string msg;
     msg.append("Kokkos::HostSpace ");
@@ -215,42 +171,6 @@ void *HostSpace::impl_allocate(
       ptr = _mm_malloc(arg_alloc_size, alignment);
     }
 #endif
-
-#if defined(KOKKOS_ENABLE_POSIX_MEMALIGN)
-    else if (m_alloc_mech == POSIX_MEMALIGN) {
-      posix_memalign(&ptr, alignment, arg_alloc_size);
-    }
-#endif
-
-#if defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS)
-    else if (m_alloc_mech == POSIX_MMAP) {
-      constexpr size_t use_huge_pages = (1u << 27);
-      constexpr int prot              = PROT_READ | PROT_WRITE;
-      const int flags                 = arg_alloc_size < use_huge_pages
-                            ? KOKKOS_IMPL_POSIX_MMAP_FLAGS
-                            : KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE;
-
-      // read write access to private memory
-
-      ptr =
-          mmap(nullptr /* address hint, if nullptr OS kernel chooses address */
-               ,
-               arg_alloc_size /* size in bytes */
-               ,
-               prot /* memory protection */
-               ,
-               flags /* visibility of updates */
-               ,
-               -1 /* file descriptor */
-               ,
-               0 /* offset */
-          );
-
-      /* Associated reallocation:
-             ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE );
-      */
-    }
-#endif
   }
 
   if ((ptr == nullptr) || (reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0)) ||
@@ -324,18 +244,6 @@ void HostSpace::impl_deallocate(
       _mm_free(arg_alloc_ptr);
     }
 #endif
-
-#if defined(KOKKOS_ENABLE_POSIX_MEMALIGN)
-    else if (m_alloc_mech == POSIX_MEMALIGN) {
-      free(arg_alloc_ptr);
-    }
-#endif
-
-#if defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS)
-    else if (m_alloc_mech == POSIX_MMAP) {
-      munmap(arg_alloc_ptr, arg_alloc_size);
-    }
-#endif
   }
 }
 
@@ -428,56 +336,18 @@ void init_lock_array_host_space() {
 }
 
 bool lock_address_host_space(void *ptr) {
-#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \
-    !defined(KOKKOS_COMPILER_PGI)
-  const unsigned status = _xbegin();
-
-  if (_XBEGIN_STARTED == status) {
-    const int val =
-        HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
-                                HOST_SPACE_ATOMIC_XOR_MASK];
-
-    if (0 == val) {
-      HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
-                              HOST_SPACE_ATOMIC_XOR_MASK] = 1;
-    } else {
-      _xabort(1);
-    }
-
-    _xend();
-
-    return 1;
-  } else {
-#endif
-    return 0 == atomic_compare_exchange(
-                    &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) &
-                                              HOST_SPACE_ATOMIC_MASK) ^
-                                             HOST_SPACE_ATOMIC_XOR_MASK],
-                    0, 1);
-#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \
-    !defined(KOKKOS_COMPILER_PGI)
-  }
-#endif
+  return 0 == atomic_compare_exchange(
+                  &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) &
+                                            HOST_SPACE_ATOMIC_MASK) ^
+                                           HOST_SPACE_ATOMIC_XOR_MASK],
+                  0, 1);
 }
 
 void unlock_address_host_space(void *ptr) {
-#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \
-    !defined(KOKKOS_COMPILER_PGI)
-  const unsigned status = _xbegin();
-
-  if (_XBEGIN_STARTED == status) {
-    HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
-                            HOST_SPACE_ATOMIC_XOR_MASK] = 0;
-  } else {
-#endif
-    atomic_exchange(
-        &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
-                                 HOST_SPACE_ATOMIC_XOR_MASK],
-        0);
-#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \
-    !defined(KOKKOS_COMPILER_PGI)
-  }
-#endif
+  atomic_exchange(
+      &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
+                               HOST_SPACE_ATOMIC_XOR_MASK],
+      0);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
index 4ac0941a3..e6ef73295 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include "Kokkos_Core.hpp"
 #include "Kokkos_HostSpace_deepcopy.hpp"
 
@@ -49,6 +53,10 @@ namespace Kokkos {
 
 namespace Impl {
 
+void hostspace_fence(const DefaultHostExecutionSpace& exec) {
+  exec.fence("HostSpace fence");
+}
+
 void hostspace_parallel_deepcopy(void* dst, const void* src, ptrdiff_t n) {
   Kokkos::DefaultHostExecutionSpace exec;
   hostspace_parallel_deepcopy_async(exec, dst, src, n);
@@ -67,13 +75,13 @@ void hostspace_parallel_deepcopy_async(const DefaultHostExecutionSpace& exec,
                                        void* dst, const void* src,
                                        ptrdiff_t n) {
   using policy_t = Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>;
-  constexpr int host_deep_copy_serial_limit = 10 * 8192;
 
   // If the asynchronous HPX backend is enabled, do *not* copy anything
   // synchronously. The deep copy must be correctly sequenced with respect to
   // other kernels submitted to the same instance, so we only use the fallback
   // parallel_for version in this case.
 #if !(defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH))
+  constexpr int host_deep_copy_serial_limit = 10 * 8192;
   if ((n < host_deep_copy_serial_limit) ||
       (DefaultHostExecutionSpace().concurrency() == 1)) {
     std::memcpy(dst, src, n);
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp
index 6eec3566a..88d37672d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp
@@ -51,6 +51,8 @@ namespace Kokkos {
 
 namespace Impl {
 
+void hostspace_fence(const DefaultHostExecutionSpace& exec);
+
 void hostspace_parallel_deepcopy(void* dst, const void* src, ptrdiff_t n);
 // DeepCopy called with an execution space that can't access HostSpace
 void hostspace_parallel_deepcopy_async(void* dst, const void* src, ptrdiff_t n);
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
index a7f4a652b..1f1acca5d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <limits>
 #include <Kokkos_Macros.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
index 82aed1965..7f39f1886 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -49,7 +49,6 @@
 #include <Kokkos_Pair.hpp>
 #include <Kokkos_Atomic.hpp>
 #include <Kokkos_ExecPolicy.hpp>
-#include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_HostBarrier.hpp>
 
@@ -113,10 +112,10 @@ class HostThreadTeamData {
   int64_t* m_team_scratch;  // == pool[ 0 + m_team_base ]->m_scratch
   int m_pool_rank;
   int m_pool_size;
-  int m_team_reduce;
-  int m_team_shared;
-  int m_thread_local;
-  int m_scratch_size;
+  size_t m_team_reduce;
+  size_t m_team_shared;
+  size_t m_thread_local;
+  size_t m_scratch_size;
   int m_team_base;
   int m_team_rank;
   int m_team_size;
@@ -184,7 +183,11 @@ class HostThreadTeamData {
 
   //----------------------------------------
 
-  constexpr HostThreadTeamData() noexcept
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC bug in NVHPC regarding constexpr
+                               // constructors used in device code
+  constexpr
+#endif
+      HostThreadTeamData() noexcept
       : m_work_range(-1, -1),
         m_work_end(0),
         m_scratch(nullptr),
@@ -205,7 +208,8 @@ class HostThreadTeamData {
         m_work_chunk(0),
         m_steal_rank(0),
         m_pool_rendezvous_step(0),
-        m_team_rendezvous_step(0) {}
+        m_team_rendezvous_step(0) {
+  }
 
   //----------------------------------------
   // Organize array of members into a pool.
@@ -247,33 +251,31 @@ class HostThreadTeamData {
 
   //----------------------------------------
 
- private:
-  enum : int { mask_to_16 = 0x0f };  // align to 16 bytes
-  enum : int { shift_to_8 = 3 };     // size to 8 bytes
-
  public:
-  static constexpr int align_to_int64(int n) {
+  static constexpr size_t align_to_int64(size_t n) {
+    constexpr size_t mask_to_16 = 0x0f;  // align to 16 bytes
+    constexpr size_t shift_to_8 = 3;     // size to 8 bytes
     return ((n + mask_to_16) & ~mask_to_16) >> shift_to_8;
   }
 
-  constexpr int pool_reduce_bytes() const {
+  constexpr size_t pool_reduce_bytes() const {
     return m_scratch_size ? sizeof(int64_t) * (m_team_reduce - m_pool_reduce)
                           : 0;
   }
 
-  constexpr int team_reduce_bytes() const {
+  constexpr size_t team_reduce_bytes() const {
     return sizeof(int64_t) * (m_team_shared - m_team_reduce);
   }
 
-  constexpr int team_shared_bytes() const {
+  constexpr size_t team_shared_bytes() const {
     return sizeof(int64_t) * (m_thread_local - m_team_shared);
   }
 
-  constexpr int thread_local_bytes() const {
+  constexpr size_t thread_local_bytes() const {
     return sizeof(int64_t) * (m_scratch_size - m_thread_local);
   }
 
-  constexpr int scratch_bytes() const {
+  constexpr size_t scratch_bytes() const {
     return sizeof(int64_t) * m_scratch_size;
   }
 
@@ -310,8 +312,9 @@ class HostThreadTeamData {
   //   thread_local_size = number bytes for thread local memory
   // Return:
   //   total number of bytes that must be allocated
-  static size_t scratch_size(int pool_reduce_size, int team_reduce_size,
-                             int team_shared_size, int thread_local_size) {
+  static size_t scratch_size(size_t pool_reduce_size, size_t team_reduce_size,
+                             size_t team_shared_size,
+                             size_t thread_local_size) {
     pool_reduce_size  = align_to_int64(pool_reduce_size);
     team_reduce_size  = align_to_int64(team_reduce_size);
     team_shared_size  = align_to_int64(team_shared_size);
@@ -336,7 +339,7 @@ class HostThreadTeamData {
   //   total number of bytes that must be allocated
   void scratch_assign(void* const alloc_ptr, size_t const alloc_size,
                       int pool_reduce_size, int team_reduce_size,
-                      int team_shared_size, int /* thread_local_size */) {
+                      size_t team_shared_size, size_t /* thread_local_size */) {
     pool_reduce_size = align_to_int64(pool_reduce_size);
     team_reduce_size = align_to_int64(team_reduce_size);
     team_shared_size = align_to_int64(team_shared_size);
@@ -556,18 +559,15 @@ class HostThreadTeamMember {
   // team_reduce( Max(result) );
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer) const noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer) const noexcept {
     team_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION
-      typename std::enable_if<is_reducer<ReducerType>::value>::type
-      team_reduce(ReducerType const& reducer,
-                  typename ReducerType::value_type contribution) const
-      noexcept {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer,
+              typename ReducerType::value_type contribution) const noexcept {
     KOKKOS_IF_ON_HOST((
         if (1 < m_data.m_team_size) {
           using value_type = typename ReducerType::value_type;
@@ -689,61 +689,59 @@ template <typename iType, typename Member>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
 TeamThreadRange(
     Member const& member, iType count,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
   return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
 }
 
 template <typename iType1, typename iType2, typename Member>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Member>
+    std::common_type_t<iType1, iType2>, Member>
 TeamThreadRange(
     Member const& member, iType1 begin, iType2 end,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
   return Impl::TeamThreadRangeBoundariesStruct<
-      typename std::common_type<iType1, iType2>::type, Member>(member, begin,
-                                                               end);
+      std::common_type_t<iType1, iType2>, Member>(member, begin, end);
 }
 
 template <typename iType, typename Member>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
 TeamVectorRange(
     Member const& member, iType count,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
   return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
 }
 
 template <typename iType1, typename iType2, typename Member>
 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Member>
+    std::common_type_t<iType1, iType2>, Member>
 TeamVectorRange(
     Member const& member, iType1 begin, iType2 end,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
   return Impl::TeamThreadRangeBoundariesStruct<
-      typename std::common_type<iType1, iType2>::type, Member>(member, begin,
-                                                               end);
+      std::common_type_t<iType1, iType2>, Member>(member, begin, end);
 }
 
 template <typename iType, typename Member>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<iType, Member>
 ThreadVectorRange(
     Member const& member, iType count,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(member, count);
 }
 
 template <typename iType1, typename iType2, typename Member>
 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    typename std::common_type<iType1, iType2>::type, Member>
+    std::common_type_t<iType1, iType2>, Member>
 ThreadVectorRange(
     Member const& member, iType1 arg_begin, iType2 arg_end,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
-  using iType = typename std::common_type<iType1, iType2>::type;
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
+  using iType = std::common_type_t<iType1, iType2>;
   return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(
       member, iType(arg_begin), iType(arg_end));
 }
@@ -759,8 +757,8 @@ template <typename iType, class Closure, class Member>
 KOKKOS_INLINE_FUNCTION void parallel_for(
     Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
     Closure const& closure,
-    typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
-        type const** = nullptr) {
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value> const** =
+        nullptr) {
   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
        i += loop_boundaries.increment) {
     closure(i);
@@ -772,8 +770,8 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
         loop_boundaries,
     Closure const& closure,
-    typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
-        type const** = nullptr) {
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value> const** =
+        nullptr) {
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
 #endif
@@ -786,12 +784,12 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
 //----------------------------------------------------------------------------
 
 template <typename iType, class Closure, class Reducer, class Member>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Kokkos::is_reducer<Reducer>::value &&
-    Impl::is_host_thread_team_member<Member>::value>::type
-parallel_reduce(
-    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
-    Closure const& closure, Reducer const& reducer) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer<Reducer>::value &&
+                     Impl::is_host_thread_team_member<Member>::value>
+    parallel_reduce(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const&
+                        loop_boundaries,
+                    Closure const& closure, Reducer const& reducer) {
   typename Reducer::value_type value;
   reducer.init(value);
 
@@ -804,12 +802,12 @@ parallel_reduce(
 }
 
 template <typename iType, typename Closure, typename ValueType, typename Member>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    !Kokkos::is_reducer<ValueType>::value &&
-    Impl::is_host_thread_team_member<Member>::value>::type
-parallel_reduce(
-    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
-    Closure const& closure, ValueType& result) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<!Kokkos::is_reducer<ValueType>::value &&
+                     Impl::is_host_thread_team_member<Member>::value>
+    parallel_reduce(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const&
+                        loop_boundaries,
+                    Closure const& closure, ValueType& result) {
   ValueType val;
   Sum<ValueType> reducer(val);
   reducer.init(val);
@@ -858,12 +856,12 @@ Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
  *  performed and put into result.
  */
 template <typename iType, class Lambda, typename ValueType, typename Member>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    !Kokkos::is_reducer<ValueType>::value &&
-    Impl::is_host_thread_team_member<Member>::value>::type
-parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
-                    loop_boundaries,
-                const Lambda& lambda, ValueType& result) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<!Kokkos::is_reducer<ValueType>::value &&
+                     Impl::is_host_thread_team_member<Member>::value>
+    parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                        iType, Member>& loop_boundaries,
+                    const Lambda& lambda, ValueType& result) {
   result = ValueType();
   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
        i += loop_boundaries.increment) {
@@ -872,12 +870,12 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
 }
 
 template <typename iType, class Lambda, typename ReducerType, typename Member>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Kokkos::is_reducer<ReducerType>::value &&
-    Impl::is_host_thread_team_member<Member>::value>::type
-parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
-                    loop_boundaries,
-                const Lambda& lambda, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer<ReducerType>::value &&
+                     Impl::is_host_thread_team_member<Member>::value>
+    parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                        iType, Member>& loop_boundaries,
+                    const Lambda& lambda, const ReducerType& reducer) {
   reducer.init(reducer.reference());
   for (iType i = loop_boundaries.start; i < loop_boundaries.end;
        i += loop_boundaries.increment) {
@@ -888,11 +886,11 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
 //----------------------------------------------------------------------------
 
 template <typename iType, class Closure, class Member>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Impl::is_host_thread_team_member<Member>::value>::type
-parallel_scan(
-    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
-    Closure const& closure) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    parallel_scan(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const&
+                      loop_boundaries,
+                  Closure const& closure) {
   // Extract ValueType from the closure
 
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
@@ -916,11 +914,11 @@ parallel_scan(
 }
 
 template <typename iType, class ClosureType, class Member>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Impl::is_host_thread_team_member<Member>::value>::type
-parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
-                  loop_boundaries,
-              ClosureType const& closure) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
+                      loop_boundaries,
+                  ClosureType const& closure) {
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
       Impl::FunctorPatternInterface::SCAN, void, ClosureType>::value_type;
 
@@ -936,12 +934,12 @@ parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
 }
 
 template <typename iType, class Lambda, typename ReducerType, typename Member>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Kokkos::is_reducer<ReducerType>::value &&
-    Impl::is_host_thread_team_member<Member>::value>::type
-parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
-                  loop_boundaries,
-              const Lambda& lambda, const ReducerType& reducer) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer<ReducerType>::value &&
+                     Impl::is_host_thread_team_member<Member>::value>
+    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
+                      loop_boundaries,
+                  const Lambda& lambda, const ReducerType& reducer) {
   typename ReducerType::value_type scan_val;
   reducer.init(scan_val);
 
@@ -959,48 +957,49 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
 template <class Member>
 KOKKOS_INLINE_FUNCTION Impl::ThreadSingleStruct<Member> PerTeam(
     Member const& member,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
   return Impl::ThreadSingleStruct<Member>(member);
 }
 
 template <class Member>
 KOKKOS_INLINE_FUNCTION Impl::VectorSingleStruct<Member> PerThread(
     Member const& member,
-    typename std::enable_if<
-        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+    std::enable_if_t<Impl::is_thread_team_member<Member>::value> const** =
+        nullptr) {
   return Impl::VectorSingleStruct<Member>(member);
 }
 
 template <class Member, class FunctorType>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Impl::is_host_thread_team_member<Member>::value>::type
-single(const Impl::ThreadSingleStruct<Member>& single,
-       const FunctorType& functor) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    single(const Impl::ThreadSingleStruct<Member>& single,
+           const FunctorType& functor) {
   // 'single' does not perform a barrier.
   if (single.team_member.team_rank() == 0) functor();
 }
 
 template <class Member, class FunctorType, typename ValueType>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Impl::is_host_thread_team_member<Member>::value>::type
-single(const Impl::ThreadSingleStruct<Member>& single,
-       const FunctorType& functor, ValueType& val) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    single(const Impl::ThreadSingleStruct<Member>& single,
+           const FunctorType& functor, ValueType& val) {
   single.team_member.team_broadcast(functor, val, 0);
 }
 
 template <class Member, class FunctorType>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Impl::is_host_thread_team_member<Member>::value>::type
-single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    single(const Impl::VectorSingleStruct<Member>&,
+           const FunctorType& functor) {
   functor();
 }
 
 template <class Member, class FunctorType, typename ValueType>
-KOKKOS_INLINE_FUNCTION typename std::enable_if<
-    Impl::is_host_thread_team_member<Member>::value>::type
-single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor,
-       ValueType& val) {
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
+    single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor,
+           ValueType& val) {
   functor(val);
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp b/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp
new file mode 100644
index 000000000..ceb35f024
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp
@@ -0,0 +1,195 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_INITIALIZATION_SETTINGS_HPP
+#define KOKKOS_INITIALIZATION_SETTINGS_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#include <climits>
+#include <string>
+
+namespace Kokkos {
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+struct InitArguments {
+  int num_threads;
+  int num_numa;
+  int device_id;
+  int ndevices;
+  int skip_device;
+  bool disable_warnings;
+  bool tune_internals;
+  bool tool_help        = false;
+  std::string tool_lib  = {};
+  std::string tool_args = {};
+
+  KOKKOS_DEPRECATED_WITH_COMMENT("Use InitializationSettings instead!")
+  InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false,
+                bool ti = false)
+      : num_threads{nt},
+        num_numa{nn},
+        device_id{dv},
+        ndevices{-1},
+        skip_device{9999},
+        disable_warnings{dw},
+        tune_internals{ti} {}
+};
+#endif
+
+namespace Impl {
+// FIXME_CXX17 replace with std::optional
+template <class>
+struct InitializationSettingsHelper;
+template <>
+struct InitializationSettingsHelper<int> {
+  using value_type   = int;
+  using storage_type = int;
+
+  static constexpr storage_type unspecified = INT_MIN;
+};
+template <>
+struct InitializationSettingsHelper<bool> {
+  using value_type   = bool;
+  using storage_type = char;
+
+  static constexpr storage_type unspecified = CHAR_MAX;
+  static_assert(static_cast<storage_type>(true) != unspecified &&
+                    static_cast<storage_type>(false) != unspecified,
+                "");
+};
+template <>
+struct InitializationSettingsHelper<std::string> {
+  using value_type   = std::string;
+  using storage_type = std::string;
+
+  static storage_type const unspecified;
+};
+}  // namespace Impl
+
+class InitializationSettings {
+#define KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER(NAME) \
+  impl_do_not_use_i_really_mean_it_##NAME##_
+
+#define KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE(NAME) impl_##NAME##_type
+
+#define KOKKOS_IMPL_DECLARE(TYPE, NAME)                                      \
+ private:                                                                    \
+  using KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE(NAME) = TYPE;                 \
+  Impl::InitializationSettingsHelper<TYPE>::storage_type                     \
+      KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER(NAME) =                              \
+          Impl::InitializationSettingsHelper<TYPE>::unspecified;             \
+                                                                             \
+ public:                                                                     \
+  InitializationSettings& set_##NAME(                                        \
+      Impl::InitializationSettingsHelper<TYPE>::value_type NAME) {           \
+    KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER(NAME) = NAME;                          \
+    return *this;                                                            \
+  }                                                                          \
+  bool has_##NAME() const noexcept {                                         \
+    return KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER(NAME) !=                        \
+           Impl::InitializationSettingsHelper<                               \
+               KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE(NAME)>::unspecified;   \
+  }                                                                          \
+  KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE(NAME) get_##NAME() const noexcept { \
+    return KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER(NAME);                          \
+  }                                                                          \
+  static_assert(true, "no-op to require trailing semicolon")
+
+ public:
+  KOKKOS_IMPL_DECLARE(int, num_threads);
+  KOKKOS_IMPL_DECLARE(int, device_id);
+  KOKKOS_IMPL_DECLARE(std::string, map_device_id_by);
+  KOKKOS_IMPL_DECLARE(int, num_devices);  // deprecated
+  KOKKOS_IMPL_DECLARE(int, skip_device);  // deprecated
+  KOKKOS_IMPL_DECLARE(bool, disable_warnings);
+  KOKKOS_IMPL_DECLARE(bool, print_configuration);
+  KOKKOS_IMPL_DECLARE(bool, tune_internals);
+  KOKKOS_IMPL_DECLARE(bool, tools_help);
+  KOKKOS_IMPL_DECLARE(std::string, tools_libs);
+  KOKKOS_IMPL_DECLARE(std::string, tools_args);
+
+#undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE
+#undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER
+#undef KOKKOS_IMPL_DECLARE
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+ public:
+  InitializationSettings() = default;
+
+  InitializationSettings(InitArguments const& old) {
+    if (old.num_threads != -1) {
+      set_num_threads(old.num_threads);
+    }
+    if (old.device_id != -1) {
+      set_device_id(old.device_id);
+    }
+    if (old.ndevices != -1) {
+      set_num_devices(old.ndevices);
+    }
+    if (old.skip_device != 9999) {
+      set_skip_device(old.skip_device);
+    }
+    if (old.disable_warnings) {
+      set_disable_warnings(true);
+    }
+    if (old.tune_internals) {
+      set_tune_internals(true);
+    }
+    if (old.tool_help) {
+      set_tools_help(true);
+    }
+    if (!old.tool_lib.empty()) {
+      set_tools_libs(old.tool_lib);
+    }
+    if (!old.tool_args.empty()) {
+      set_tools_args(old.tool_args);
+    }
+  }
+#endif
+};
+
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp b/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp
index 683c5c9b1..286c56743 100644
--- a/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp
@@ -77,7 +77,7 @@ struct LockBasedLIFOCommon {
   static constexpr uintptr_t LockTag = ~uintptr_t(0);
   static constexpr uintptr_t EndTag  = ~uintptr_t(1);
 
-  OwningRawPtr<node_type> m_head = (node_type*)EndTag;
+  OwningRawPtr<node_type> m_head = reinterpret_cast<node_type*>(EndTag);
 
   KOKKOS_INLINE_FUNCTION
   bool _try_push_node(node_type& node) {
@@ -89,7 +89,7 @@ struct LockBasedLIFOCommon {
     auto* old_head = m_head;
 
     // retry until someone locks the queue or we successfully compare exchange
-    while (old_head != (node_type*)LockTag) {
+    while (old_head != reinterpret_cast<node_type*>(LockTag)) {
       // TODO @tasking @memory_order DSH this should have a memory order and not
       // a memory fence
 
@@ -132,7 +132,8 @@ struct LockBasedLIFOCommon {
   bool _is_empty() const noexcept {
     // TODO @tasking @memory_order DSH make this an atomic load with memory
     // order
-    return (volatile node_type*)this->m_head == (node_type*)EndTag;
+    return (volatile node_type*)this->m_head ==
+           reinterpret_cast<node_type*>(EndTag);
   }
 };
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp b/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp
index 889d821bb..f82e88fad 100644
--- a/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <impl/Kokkos_Error.hpp>
 
 #include <ostream>
diff --git a/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp b/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp
index 221840576..7dede48a1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp
@@ -89,7 +89,7 @@ class MemoryPoolAllocator {
   using value_type      = T;
   using pointer         = T*;
   using size_type       = typename MemoryPool::memory_space::size_type;
-  using difference_type = typename std::make_signed<size_type>::type;
+  using difference_type = std::make_signed_t<size_type>;
 
   template <class U>
   struct rebind {
diff --git a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp
index ec2e573c0..a80ea0a1d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp
@@ -47,6 +47,10 @@
  *  implementations thereof.
  */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <impl/Kokkos_MemorySpace.hpp>
 
 #include <iostream>
diff --git a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp
index 5b3764686..dee11bbdb 100644
--- a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp
@@ -78,6 +78,20 @@ SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space,
   return nullptr;  // unreachable
 }
 
+template <class ExecutionSpace, class MemorySpace>
+SharedAllocationHeader *checked_allocation_with_header(
+    ExecutionSpace const &exec_space, MemorySpace const &space,
+    std::string const &label, size_t alloc_size) {
+  try {
+    return reinterpret_cast<SharedAllocationHeader *>(space.allocate(
+        exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader),
+        alloc_size));
+  } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) {
+    safe_throw_allocation_with_header_failure(space.name(), label, failure);
+  }
+  return nullptr;  // unreachable
+}
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index f68708996..1df5d13b9 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -58,8 +58,7 @@ void memory_fence() {
 #elif defined(__HIP_DEVICE_COMPILE__)
   __threadfence();
 #elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
-  sycl::atomic_fence(sycl::ext::oneapi::memory_order::acq_rel,
-                     sycl::ext::oneapi::memory_scope::device);
+  sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
 #elif defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
   asm volatile("mfence" ::: "memory");
 #elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || \
diff --git a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
index 1c61b73f0..209ba1920 100644
--- a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
@@ -64,7 +64,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -102,10 +101,10 @@ struct MultipleTaskQueueTeamEntry {
   using ready_queue_type =
       typename TaskQueueTraits::template ready_queue_type<task_base_type>;
   using task_queue_traits         = TaskQueueTraits;
-  using task_scheduling_info_type = typename std::conditional<
+  using task_scheduling_info_type = std::conditional_t<
       TaskQueueTraits::ready_queue_insertion_may_fail,
       FailedQueueInsertionLinkedListSchedulingInfo<TaskQueueTraits>,
-      EmptyTaskSchedulingInfo>::type;
+      EmptyTaskSchedulingInfo>;
 
  private:
   // Number of allowed priorities
@@ -123,10 +122,9 @@ struct MultipleTaskQueueTeamEntry {
   template <class _always_void = void>
   KOKKOS_INLINE_FUNCTION OptionalRef<task_base_type> _pop_failed_insertion(
       int priority, TaskType type,
-      typename std::enable_if<
-          task_queue_traits::ready_queue_insertion_may_fail &&
-              std::is_void<_always_void>::value,
-          void*>::type = nullptr) {
+      std::enable_if_t<task_queue_traits::ready_queue_insertion_may_fail &&
+                           std::is_void<_always_void>::value,
+                       void*> = nullptr) {
     auto* rv_ptr = m_failed_heads[priority][(int)type];
     if (rv_ptr) {
       m_failed_heads[priority][(int)type] =
@@ -142,10 +140,9 @@ struct MultipleTaskQueueTeamEntry {
   template <class _always_void = void>
   KOKKOS_INLINE_FUNCTION OptionalRef<task_base_type> _pop_failed_insertion(
       int /*priority*/, TaskType /*type*/,
-      typename std::enable_if<
-          !task_queue_traits::ready_queue_insertion_may_fail &&
-              std::is_void<_always_void>::value,
-          void*>::type = nullptr) {
+      std::enable_if_t<!task_queue_traits::ready_queue_insertion_may_fail &&
+                           std::is_void<_always_void>::value,
+                       void*> = nullptr) {
     return OptionalRef<task_base_type>{nullptr};
   }
 
@@ -201,10 +198,9 @@ struct MultipleTaskQueueTeamEntry {
   template <class _always_void = void>
   KOKKOS_INLINE_FUNCTION void do_handle_failed_insertion(
       runnable_task_base_type&& task,
-      typename std::enable_if<
-          task_queue_traits::ready_queue_insertion_may_fail &&
-              std::is_void<_always_void>::value,
-          void*>::type = nullptr) {
+      std::enable_if_t<task_queue_traits::ready_queue_insertion_may_fail &&
+                           std::is_void<_always_void>::value,
+                       void*> = nullptr) {
     // failed insertions, if they happen, must be from the only thread that
     // is allowed to push to m_ready_queues, so this linked-list insertion is
     // not concurrent
@@ -217,21 +213,20 @@ struct MultipleTaskQueueTeamEntry {
   template <class _always_void = void>
   KOKKOS_INLINE_FUNCTION void do_handle_failed_insertion(
       runnable_task_base_type&& /*task*/,
-      typename std::enable_if<
-          !task_queue_traits::ready_queue_insertion_may_fail &&
-              std::is_void<_always_void>::value,
-          void*>::type = nullptr) {
+      std::enable_if_t<!task_queue_traits::ready_queue_insertion_may_fail &&
+                           std::is_void<_always_void>::value,
+                       void*> = nullptr) {
     Kokkos::abort("should be unreachable!");
   }
 
   template <class _always_void = void>
   KOKKOS_INLINE_FUNCTION void flush_failed_insertions(
       int priority, int task_type,
-      typename std::enable_if<
+      std::enable_if_t<
           task_queue_traits::ready_queue_insertion_may_fail &&
               std::is_void<_always_void>::value,  // just to make this dependent
                                                   // on template parameter
-          int>::type = 0) {
+          int> = 0) {
     // TODO @tasking @minor DSH this somethimes gets some things out of LIFO
     // order, which may be undesirable (but not a bug)
 
@@ -256,11 +251,11 @@ struct MultipleTaskQueueTeamEntry {
   template <class _always_void = void>
   KOKKOS_INLINE_FUNCTION void flush_failed_insertions(
       int, int,
-      typename std::enable_if<
+      std::enable_if_t<
           !task_queue_traits::ready_queue_insertion_may_fail &&
               std::is_void<_always_void>::value,  // just to make this dependent
                                                   // on template parameter
-          int>::type = 0) {}
+          int> = 0) {}
 
   KOKKOS_INLINE_FUNCTION
   void flush_all_failed_insertions() {
@@ -353,10 +348,10 @@ class MultipleTaskQueue final
     ~SchedulerInfo() = default;
   };
 
-  using task_scheduling_info_type = typename std::conditional<
+  using task_scheduling_info_type = std::conditional_t<
       TaskQueueTraits::ready_queue_insertion_may_fail,
       FailedQueueInsertionLinkedListSchedulingInfo<TaskQueueTraits>,
-      EmptyTaskSchedulingInfo>::type;
+      EmptyTaskSchedulingInfo>;
   using team_scheduler_info_type = SchedulerInfo;
 
   using runnable_task_base_type = RunnableTaskBase<TaskQueueTraits>;
diff --git a/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp b/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp
index 71067b8e1..5ff094032 100644
--- a/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp
@@ -1,3 +1,8 @@
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_NumericTraits.hpp>
 
 // NOTE These out-of class definitions are only required with C++14.  Since
diff --git a/packages/kokkos/core/src/impl/Kokkos_OptionalRef.hpp b/packages/kokkos/core/src/impl/Kokkos_OptionalRef.hpp
index 12f6c9f5f..1a3cbaba2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_OptionalRef.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_OptionalRef.hpp
@@ -120,18 +120,14 @@ struct OptionalRef {
   //----------------------------------------
 
   KOKKOS_INLINE_FUNCTION
-  OptionalRef<typename std::add_volatile<T>::type>
-  as_volatile() volatile noexcept {
-    return OptionalRef<typename std::add_volatile<T>::type>(*(*this));
+  OptionalRef<std::add_volatile_t<T>> as_volatile() volatile noexcept {
+    return OptionalRef<std::add_volatile_t<T>>(*(*this));
   }
 
   KOKKOS_INLINE_FUNCTION
-  OptionalRef<
-      typename std::add_volatile<typename std::add_const<T>::type>::type>
-  as_volatile() const volatile noexcept {
-    return OptionalRef<
-        typename std::add_volatile<typename std::add_const<T>::type>::type>(
-        *(*this));
+  OptionalRef<std::add_volatile_t<std::add_const_t<T>>> as_volatile() const
+      volatile noexcept {
+    return OptionalRef<std::add_volatile_t<std::add_const_t<T>>>(*(*this));
   }
 
   //----------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_ParseCommandLineArgumentsAndEnvironmentVariables.hpp b/packages/kokkos/core/src/impl/Kokkos_ParseCommandLineArgumentsAndEnvironmentVariables.hpp
new file mode 100644
index 000000000..4fdb85b6a
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ParseCommandLineArgumentsAndEnvironmentVariables.hpp
@@ -0,0 +1,58 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PARSE_COMMAND_LINE_ARGUMENTS_AND_ENVIRONMENT_VARIABLES_HPP
+#define KOKKOS_PARSE_COMMAND_LINE_ARGUMENTS_AND_ENVIRONMENT_VARIABLES_HPP
+
+// These declaration are only provided for testing purposes
+namespace Kokkos {
+class InitializationSettings;
+namespace Impl {
+void parse_command_line_arguments(int& argc, char* argv[],
+                                  InitializationSettings& settings);
+void parse_environment_variables(InitializationSettings& settings);
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
index 9c8118e2b..480b1a392 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #ifndef KOKKOS_TOOLS_INDEPENDENT_BUILD
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Tuners.hpp>
@@ -67,6 +71,34 @@
 #include <vector>
 #include <sstream>
 #include <iostream>
+
+namespace {
+void warn_cmd_line_arg_ignored_when_kokkos_tools_disabled(char const* arg) {
+#ifndef KOKKOS_TOOLS_ENABLE_LIBDL
+  if (Kokkos::show_warnings()) {
+    std::cerr << "Warning: command line argument '" << arg
+              << "' ignored because kokkos-tools is disabled."
+              << " Raised by Kokkos::initialize()." << std::endl;
+  }
+#else
+  (void)arg;
+#endif
+}
+void warn_env_var_ignored_when_kokkos_tools_disabled(char const* env_var,
+                                                     char const* val) {
+#ifndef KOKKOS_TOOLS_ENABLE_LIBDL
+  if (Kokkos::show_warnings()) {
+    std::cerr << "Warning: environment variable '" << env_var << "=" << val
+              << "' ignored because kokkos-tools is disabled."
+              << " Raised by Kokkos::initialize()." << std::endl;
+  }
+#else
+  (void)env_var;
+  (void)val;
+#endif
+}
+}  // namespace
+
 namespace Kokkos {
 
 namespace Tools {
@@ -77,34 +109,29 @@ const std::string InitArguments::unset_string_option = {
 InitArguments tool_arguments;
 
 namespace Impl {
-void parse_command_line_arguments(int& narg, char* arg[],
+void parse_command_line_arguments(int& argc, char* argv[],
                                   InitArguments& arguments) {
   int iarg = 0;
   using Kokkos::Impl::check_arg;
-  using Kokkos::Impl::check_int_arg;
-  using Kokkos::Impl::check_str_arg;
-
-  auto& lib            = arguments.lib;
-  auto& args           = arguments.args;
-  auto& help           = arguments.help;
-  auto& tune_internals = arguments.tune_internals;
-  while (iarg < narg) {
-    if (check_arg(arg[iarg], "--kokkos-tune-internals")) {
-      tune_internals = InitArguments::PossiblyUnsetOption::on;
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
-      }
-      narg--;
-    } else if (check_str_arg(arg[iarg], "--kokkos-tools-library", lib)) {
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
-      }
-      narg--;
-    } else if (check_str_arg(arg[iarg], "--kokkos-tools-args", args)) {
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
+  using Kokkos::Impl::check_arg_str;
+
+  auto& libs = arguments.lib;
+  auto& args = arguments.args;
+  auto& help = arguments.help;
+  while (iarg < argc) {
+    bool remove_flag = false;
+    if (check_arg_str(argv[iarg], "--kokkos-tools-libs", libs) ||
+        check_arg_str(argv[iarg], "--kokkos-tools-library", libs)) {
+      if (check_arg(argv[iarg], "--kokkos-tools-library")) {
+        using Kokkos::Impl::warn_deprecated_command_line_argument;
+        warn_deprecated_command_line_argument("--kokkos-tools-library",
+                                              "--kokkos-tools-libs");
       }
-      narg--;
+      warn_cmd_line_arg_ignored_when_kokkos_tools_disabled(argv[iarg]);
+      remove_flag = true;
+    } else if (check_arg_str(argv[iarg], "--kokkos-tools-args", args)) {
+      warn_cmd_line_arg_ignored_when_kokkos_tools_disabled(argv[iarg]);
+      remove_flag = true;
       // strip any leading and/or trailing quotes if they were retained in the
       // string because this will very likely cause parsing issues for tools.
       // If the quotes are retained (via bypassing the shell):
@@ -118,56 +145,71 @@ void parse_command_line_arguments(int& narg, char* arg[],
         if (args.back() == '"') args = args.substr(0, args.length() - 1);
       }
       // add the name of the executable to the beginning
-      if (narg > 0) args = std::string(arg[0]) + " " + args;
-    } else if (check_arg(arg[iarg], "--kokkos-tools-help")) {
+      if (argc > 0) args = std::string(argv[0]) + " " + args;
+    } else if (check_arg(argv[iarg], "--kokkos-tools-help")) {
       help = InitArguments::PossiblyUnsetOption::on;
-      for (int k = iarg; k < narg - 1; k++) {
-        arg[k] = arg[k + 1];
+      warn_cmd_line_arg_ignored_when_kokkos_tools_disabled(argv[iarg]);
+      remove_flag = true;
+    } else if (std::regex_match(argv[iarg], std::regex("-?-kokkos-tool.*",
+                                                       std::regex::egrep))) {
+      std::cerr << "Warning: command line argument '" << argv[iarg]
+                << "' is not recognized."
+                << " Raised by Kokkos::initialize()." << std::endl;
+    }
+    if (remove_flag) {
+      // Shift the remainder of the argv list by one.  Note that argv has
+      // (argc + 1) arguments, the last one always being nullptr.  The following
+      // loop moves the trailing nullptr element as well
+      for (int k = iarg; k < argc; ++k) {
+        argv[k] = argv[k + 1];
       }
-      narg--;
+      argc--;
     } else {
       iarg++;
     }
-    if ((args == Kokkos::Tools::InitArguments::unset_string_option) && narg > 0)
-      args = arg[0];
+    if ((args == Kokkos::Tools::InitArguments::unset_string_option) && argc > 0)
+      args = argv[0];
   }
 }
 Kokkos::Tools::Impl::InitializationStatus parse_environment_variables(
     InitArguments& arguments) {
-  auto& tool_lib       = arguments.lib;
-  auto& tune_internals = arguments.tune_internals;
-  auto env_tool_lib    = std::getenv("KOKKOS_PROFILE_LIBRARY");
-  if (env_tool_lib != nullptr) {
-    if ((tool_lib != Kokkos::Tools::InitArguments::unset_string_option) &&
-        std::string(env_tool_lib) != tool_lib)
-      return {Kokkos::Tools::Impl::InitializationStatus::InitializationResult::
-                  environment_argument_mismatch,
-              "Error: expecting a match between --kokkos-tools-library and "
-              "KOKKOS_PROFILE_LIBRARY if both are set. Raised by "
-              "Kokkos::initialize(int narg, char* argc[])."};
-    else
-      tool_lib = env_tool_lib;
+  auto& libs               = arguments.lib;
+  auto& args               = arguments.args;
+  auto env_profile_library = std::getenv("KOKKOS_PROFILE_LIBRARY");
+  if (env_profile_library != nullptr) {
+    using Kokkos::Impl::warn_deprecated_environment_variable;
+    warn_deprecated_environment_variable("KOKKOS_PROFILE_LIBRARY",
+                                         "KOKKOS_TOOLS_LIBS");
+    warn_env_var_ignored_when_kokkos_tools_disabled("KOKKOS_PROFILE_LIBRARY",
+                                                    env_profile_library);
+    libs = env_profile_library;
   }
-  char* env_tuneinternals_str = std::getenv("KOKKOS_TUNE_INTERNALS");
-  if (env_tuneinternals_str != nullptr) {
-    std::string env_str(env_tuneinternals_str);  // deep-copies string
-    for (char& c : env_str) {
-      c = toupper(c);
+  auto env_tools_libs = std::getenv("KOKKOS_TOOLS_LIBS");
+  if (env_tools_libs != nullptr) {
+    warn_env_var_ignored_when_kokkos_tools_disabled("KOKKOS_TOOLS_LIBS",
+                                                    env_tools_libs);
+    if (env_profile_library != nullptr && libs != env_tools_libs) {
+      std::stringstream ss;
+      ss << "Error: environment variables 'KOKKOS_PROFILE_LIBRARY="
+         << env_profile_library << "' and 'KOKKOS_TOOLS_LIBS=" << env_tools_libs
+         << "' are both set and do not match."
+         << " Raised by Kokkos::initialize().\n";
+      Kokkos::abort(ss.str().c_str());
     }
-    if ((env_str == "TRUE") || (env_str == "ON") || (env_str == "1"))
-      tune_internals = InitArguments::PossiblyUnsetOption::on;
-    else if (tune_internals)
-      return {Kokkos::Tools::Impl::InitializationStatus::InitializationResult::
-                  environment_argument_mismatch,
-              "Error: expecting a match between --kokkos-tune-internals and "
-              "KOKKOS_TUNE_INTERNALS if both are set. Raised by "
-              "Kokkos::initialize(int narg, char* argc[])."};
+    libs = env_tools_libs;
+  }
+  auto env_tools_args = std::getenv("KOKKOS_TOOLS_ARGS");
+  if (env_tools_args != nullptr) {
+    warn_env_var_ignored_when_kokkos_tools_disabled("KOKKOS_TOOLS_ARGS",
+                                                    env_tools_args);
+    args = env_tools_args;
   }
   return {
       Kokkos::Tools::Impl::InitializationStatus::InitializationResult::success};
 }
 InitializationStatus initialize_tools_subsystem(
     const Kokkos::Tools::InitArguments& args) {
+#ifdef KOKKOS_TOOLS_ENABLE_LIBDL
   Kokkos::Profiling::initialize(args.lib);
   auto final_args =
       (args.args != Kokkos::Tools::InitArguments::unset_string_option)
@@ -181,6 +223,9 @@ InitializationStatus initialize_tools_subsystem(
     return {InitializationStatus::InitializationResult::help_request};
   }
   Kokkos::Tools::parseArgs(final_args);
+#else
+  (void)args;
+#endif
   return {InitializationStatus::InitializationResult::success};
 }
 
@@ -190,8 +235,8 @@ void initialize(const InitArguments& arguments) {
 }
 void initialize(int argc, char* argv[]) {
   InitArguments arguments;
-  Impl::parse_command_line_arguments(argc, argv, arguments);
   Impl::parse_environment_variables(arguments);
+  Impl::parse_command_line_arguments(argc, argv, arguments);
   initialize(arguments);
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
index 4a8527f5e..cb17a0cd8 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
@@ -56,6 +56,7 @@
 namespace Kokkos {
 
 // forward declaration
+bool show_warnings() noexcept;
 bool tune_internals() noexcept;
 
 namespace Tools {
@@ -66,10 +67,9 @@ struct InitArguments {
   // for this long-term
   static const std::string unset_string_option;
   enum PossiblyUnsetOption { unset, off, on };
-  PossiblyUnsetOption tune_internals = unset;
-  PossiblyUnsetOption help           = unset;
-  std::string lib                    = unset_string_option;
-  std::string args                   = unset_string_option;
+  PossiblyUnsetOption help = unset;
+  std::string lib          = unset_string_option;
+  std::string args         = unset_string_option;
 };
 
 namespace Impl {
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index d52668205..428a3cb17 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -74,6 +74,7 @@ enum struct DeviceType {
   HPX,
   Threads,
   SYCL,
+  OpenACC,
   Unknown
 };
 
@@ -98,6 +99,7 @@ inline DeviceType devicetype_from_uint32t(const uint32_t in) {
     case 5: return DeviceType::HPX;
     case 6: return DeviceType::Threads;
     case 7: return DeviceType::SYCL;
+    case 8: return DeviceType::OpenACC;
     default: return DeviceType::Unknown;  // TODO: error out?
   }
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp b/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
index b67cede45..c7936e950 100644
--- a/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
@@ -50,6 +50,8 @@
 #if defined(KOKKOS_ENABLE_LIBQUADMATH)
 
 #include <Kokkos_NumericTraits.hpp>
+#include <Kokkos_MathematicalConstants.hpp>
+#include <Kokkos_MathematicalFunctions.hpp>
 
 #include <quadmath.h>
 
@@ -88,7 +90,14 @@ KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_max,     __float128, __float128, FLT
 KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(epsilon,        __float128, __float128, FLT128_EPSILON)
 KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(round_error,    __float128, __float128, static_cast<__float128>(0.5))
 KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(norm_min,       __float128, __float128, FLT128_MIN)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(denorm_min,     __float128, __float128, FLT128_DENORM_MIN)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(reciprocal_overflow_threshold, __float128, __float128, FLT128_MIN)
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 710)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(quiet_NaN,      __float128, __float128, __builtin_nanq(""))
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(signaling_NaN,  __float128, __float128, __builtin_nansq(""))
+#endif
 
+// Numeric characteristics traits
 KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits,         __float128,        int, FLT128_MANT_DIG)
 KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits10,       __float128,        int, FLT128_DIG)
 KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_digits10,   __float128,        int, 36)
@@ -124,21 +133,21 @@ struct reduction_identity<__float128> {
 
 //<editor-fold desc="Common mathematical functions __float128 overloads">
 namespace Kokkos {
-namespace Experimental {
 // clang-format off
+namespace Impl {
+template <> struct promote<__float128> { using type = __float128; };
+}
 // Basic operations
+inline __float128 abs(__float128 x) { return ::fabsq(x); }
 inline __float128 fabs(__float128 x) { return ::fabsq(x); }
 inline __float128 fmod(__float128 x, __float128 y) { return ::fmodq(x, y); }
 inline __float128 remainder(__float128 x, __float128 y) { return ::remainderq(x, y); }
-inline __float128 fmin(__float128 x, __float128 y) { return ::fminq(x, y); }
+// remquo
+// fma
 inline __float128 fmax(__float128 x, __float128 y) { return ::fmaxq(x, y); }
+inline __float128 fmin(__float128 x, __float128 y) { return ::fminq(x, y); }
 inline __float128 fdim(__float128 x, __float128 y) { return ::fdimq(x, y); }
 inline __float128 nanq(char const* arg) { return ::nanq(arg); }
-// Power functions
-inline __float128 pow(__float128 x, __float128 y) { return ::powq(x, y); }
-inline __float128 sqrt(__float128 x) { return ::sqrtq(x); }
-inline __float128 cbrt(__float128 x) { return ::cbrtq(x); }
-inline __float128 hypot(__float128 x, __float128 y) { return ::hypotq(x, y); }
 // Exponential functions
 inline __float128 exp(__float128 x) { return ::expq(x); }
 #if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 910)
@@ -149,6 +158,11 @@ inline __float128 log(__float128 x) { return ::logq(x); }
 inline __float128 log10(__float128 x) { return ::log10q(x); }
 inline __float128 log2(__float128 x) { return ::log2q(x); }
 inline __float128 log1p(__float128 x) { return ::log1pq(x); }
+// Power functions
+inline __float128 pow(__float128 x, __float128 y) { return ::powq(x, y); }
+inline __float128 sqrt(__float128 x) { return ::sqrtq(x); }
+inline __float128 cbrt(__float128 x) { return ::cbrtq(x); }
+inline __float128 hypot(__float128 x, __float128 y) { return ::hypotq(x, y); }
 // Trigonometric functions
 inline __float128 sin(__float128 x) { return ::sinq(x); }
 inline __float128 cos(__float128 x) { return ::cosq(x); }
@@ -173,11 +187,61 @@ inline __float128 lgamma(__float128 x) { return ::lgammaq(x); }
 inline __float128 ceil(__float128 x) { return ::ceilq(x); }
 inline __float128 floor(__float128 x) { return ::floorq(x); }
 inline __float128 trunc(__float128 x) { return ::truncq(x); }
+inline __float128 round(__float128 x) { return ::roundq(x); }
+// lround
+// llround
 inline __float128 nearbyint(__float128 x) { return ::nearbyintq(x); }
+// rint
+// lrint
+// llrint
+// Floating point manipulation functions
+// frexp
+// ldexp
+// modf
+// scalbn
+// scalbln
+// ilog
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 610)
+inline __float128 logb(__float128 x) { return ::logbq(x); }
+#endif
+inline __float128 nextafter(__float128 x, __float128 y) { return ::nextafterq(x, y); }
+// nexttoward
+inline __float128 copysign(__float128 x, __float128 y) { return ::copysignq(x, y); }
 // Classification and comparison
+// fpclassify
 inline bool isfinite(__float128 x) { return !::isinfq(x); }  // isfiniteq not provided
 inline bool isinf(__float128 x) { return ::isinfq(x); }
 inline bool isnan(__float128 x) { return ::isnanq(x); }
+// isnormal
+inline bool signbit(__float128 x) { return ::signbitq(x); }
+// isgreater
+// isgreaterequal
+// isless
+// islessequal
+// islessgreater
+// isunordered
+// clang-format on
+}  // namespace Kokkos
+//</editor-fold>
+
+//<editor-fold desc="Mathematical constants __float128 specializations">
+namespace Kokkos {
+namespace Experimental {
+// clang-format off
+template <> constexpr __float128 e_v         <__float128> = 2.718281828459045235360287471352662498Q;
+template <> constexpr __float128 log2e_v     <__float128> = 1.442695040888963407359924681001892137Q;
+template <> constexpr __float128 log10e_v    <__float128> = 0.434294481903251827651128918916605082Q;
+template <> constexpr __float128 pi_v        <__float128> = 3.141592653589793238462643383279502884Q;
+template <> constexpr __float128 inv_pi_v    <__float128> = 0.318309886183790671537767526745028724Q;
+template <> constexpr __float128 inv_sqrtpi_v<__float128> = 0.564189583547756286948079451560772586Q;
+template <> constexpr __float128 ln2_v       <__float128> = 0.693147180559945309417232121458176568Q;
+template <> constexpr __float128 ln10_v      <__float128> = 2.302585092994045684017991454684364208Q;
+template <> constexpr __float128 sqrt2_v     <__float128> = 1.414213562373095048801688724209698079Q;
+template <> constexpr __float128 sqrt3_v     <__float128> = 1.732050807568877293527446341505872367Q;
+template <> constexpr __float128 inv_sqrt3_v <__float128> = 0.577350269189625764509148780501957456Q;
+template <> constexpr __float128 egamma_v    <__float128> = 0.577215664901532860606512090082402431Q;
+template <> constexpr __float128 phi_v       <__float128> = 1.618033988749894848204586834365638118Q;
+// clang-format on
 }  // namespace Experimental
 }  // namespace Kokkos
 //</editor-fold>
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
index 149c881af..aff6332cc 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -42,13 +42,16 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Core.hpp>
 
 namespace Kokkos {
 namespace Impl {
 
-KOKKOS_THREAD_LOCAL int SharedAllocationRecord<void, void>::t_tracking_enabled =
-    1;
+thread_local int SharedAllocationRecord<void, void>::t_tracking_enabled = 1;
 
 #ifdef KOKKOS_ENABLE_DEBUG
 bool SharedAllocationRecord<void, void>::is_sane(
@@ -238,7 +241,7 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord<
   const int old_count = Kokkos::atomic_fetch_sub(&arg_record->m_count, 1);
 
   if (old_count == 1) {
-    if (!Kokkos::is_initialized()) {
+    if (is_finalized()) {
       std::stringstream ss;
       ss << "Kokkos allocation \"";
       ss << arg_record->get_label();
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
index 2f18157ff..02dcd1ec6 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
@@ -86,9 +86,9 @@ class SharedAllocationHeader {
  public:
   /* Given user memory get pointer to the header */
   KOKKOS_INLINE_FUNCTION static const SharedAllocationHeader* get_header(
-      void* alloc_ptr) {
-    return reinterpret_cast<SharedAllocationHeader*>(
-        reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader));
+      void const* alloc_ptr) {
+    return reinterpret_cast<SharedAllocationHeader const*>(
+        static_cast<char const*>(alloc_ptr) - sizeof(SharedAllocationHeader));
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -141,15 +141,22 @@ class SharedAllocationRecord<void, void> {
       SharedAllocationHeader* arg_alloc_ptr, size_t arg_alloc_size,
       function_type arg_dealloc, const std::string& label);
  private:
-  static KOKKOS_THREAD_LOCAL int t_tracking_enabled;
+  static thread_local int t_tracking_enabled;
 
  public:
   virtual std::string get_label() const { return std::string("Unmanaged"); }
 
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma push
+#pragma diag_suppress implicit_return_from_non_void_function
+#endif
   static KOKKOS_FUNCTION int tracking_enabled() {
     KOKKOS_IF_ON_HOST(return t_tracking_enabled;)
     KOKKOS_IF_ON_DEVICE(return 0;)
   }
+#if defined(__EDG__) && !defined(KOKKOS_COMPILER_INTEL)
+#pragma pop
+#endif
 
   /**\brief A host process thread claims and disables the
    *        shared allocation tracking flag.
@@ -185,7 +192,7 @@ class SharedAllocationRecord<void, void> {
 
   /* User's memory begins at the end of the header */
   KOKKOS_INLINE_FUNCTION
-  void* data() const { return reinterpret_cast<void*>(m_alloc_ptr + 1); }
+  void* data() const { return static_cast<void*>(m_alloc_ptr + 1); }
 
   /* User's memory begins at the end of the header */
   size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader); }
@@ -302,6 +309,16 @@ template <class MemorySpace, class DestroyFunctor>
 class SharedAllocationRecord
     : public SharedAllocationRecord<MemorySpace, void> {
  private:
+  template <typename ExecutionSpace>
+  SharedAllocationRecord(const ExecutionSpace& execution_space,
+                         const MemorySpace& arg_space,
+                         const std::string& arg_label, const size_t arg_alloc)
+      /*  Allocate user memory as [ SharedAllocationHeader , user_memory ] */
+      : SharedAllocationRecord<MemorySpace, void>(
+            execution_space, arg_space, arg_label, arg_alloc,
+            &Kokkos::Impl::deallocate<MemorySpace, DestroyFunctor>),
+        m_destroy() {}
+
   SharedAllocationRecord(const MemorySpace& arg_space,
                          const std::string& arg_label, const size_t arg_alloc)
       /*  Allocate user memory as [ SharedAllocationHeader , user_memory ] */
@@ -328,6 +345,17 @@ class SharedAllocationRecord
     KOKKOS_IF_ON_DEVICE(
         ((void)arg_space; (void)arg_label; (void)arg_alloc; return nullptr;))
   }
+
+  template <typename ExecutionSpace>
+  KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate(
+      const ExecutionSpace& exec_space, const MemorySpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc) {
+    KOKKOS_IF_ON_HOST(
+        (return new SharedAllocationRecord(exec_space, arg_space, arg_label,
+                                           arg_alloc);))
+    KOKKOS_IF_ON_DEVICE(((void)exec_space; (void)arg_space; (void)arg_label;
+                         (void)arg_alloc; return nullptr;))
+  }
 };
 
 template <class MemorySpace>
diff --git a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
index 7f222c92c..06bfe276c 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
@@ -153,8 +153,7 @@ class SimpleTaskScheduler
   }
 
   template <int TaskEnum, class DepTaskType, class FunctorType>
-  KOKKOS_FUNCTION future_type_for_functor<
-      typename std::decay<FunctorType>::type>
+  KOKKOS_FUNCTION future_type_for_functor<std::decay_t<FunctorType>>
   _spawn_impl(
       DepTaskType arg_predecessor_task, TaskPriority arg_priority,
       typename runnable_task_base_type::function_type apply_function_ptr,
@@ -163,7 +162,7 @@ class SimpleTaskScheduler
     KOKKOS_EXPECTS(m_queue != nullptr);
 
     using functor_future_type =
-        future_type_for_functor<typename std::decay<FunctorType>::type>;
+        future_type_for_functor<std::decay_t<FunctorType>>;
     using task_type =
         typename task_queue_type::template runnable_task_type<FunctorType,
                                                               scheduler_type>;
@@ -221,7 +220,7 @@ class SimpleTaskScheduler
     // SharedAllocationRecord pattern
     using record_type =
         Impl::SharedAllocationRecord<memory_space,
-                                     Impl::DefaultDestroy<task_queue_type> >;
+                                     Impl::DefaultDestroy<task_queue_type>>;
 
     // Allocate space for the task queue
     auto* record = record_type::allocate(memory_space(), "Kokkos::TaskQueue",
diff --git a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
index 0584cd29e..aa84fbbf6 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
@@ -64,7 +64,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
index f46d89226..d0954291f 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <Kokkos_Macros.hpp>
 
 #include <Kokkos_Atomic.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp
index 1c65fb91f..085157521 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp
@@ -66,8 +66,8 @@ enum class WaitMode : int {
 void host_thread_yield(const uint32_t i, const WaitMode mode);
 
 template <typename T>
-typename std::enable_if<std::is_integral<T>::value, void>::type
-root_spinwait_while_equal(T const volatile& flag, const T value) {
+std::enable_if_t<std::is_integral<T>::value, void> root_spinwait_while_equal(
+    T const volatile& flag, const T value) {
   Kokkos::store_fence();
   uint32_t i = 0;
   while (value == flag) {
@@ -77,8 +77,8 @@ root_spinwait_while_equal(T const volatile& flag, const T value) {
 }
 
 template <typename T>
-typename std::enable_if<std::is_integral<T>::value, void>::type
-root_spinwait_until_equal(T const volatile& flag, const T value) {
+std::enable_if_t<std::is_integral<T>::value, void> root_spinwait_until_equal(
+    T const volatile& flag, const T value) {
   Kokkos::store_fence();
   uint32_t i = 0;
   while (value != flag) {
@@ -88,8 +88,8 @@ root_spinwait_until_equal(T const volatile& flag, const T value) {
 }
 
 template <typename T>
-typename std::enable_if<std::is_integral<T>::value, void>::type
-spinwait_while_equal(T const volatile& flag, const T value) {
+std::enable_if_t<std::is_integral<T>::value, void> spinwait_while_equal(
+    T const volatile& flag, const T value) {
   Kokkos::store_fence();
   uint32_t i = 0;
   while (value == flag) {
@@ -99,8 +99,8 @@ spinwait_while_equal(T const volatile& flag, const T value) {
 }
 
 template <typename T>
-typename std::enable_if<std::is_integral<T>::value, void>::type
-yield_while_equal(T const volatile& flag, const T value) {
+std::enable_if_t<std::is_integral<T>::value, void> yield_while_equal(
+    T const volatile& flag, const T value) {
   Kokkos::store_fence();
   uint32_t i = 0;
   while (value == flag) {
@@ -110,8 +110,8 @@ yield_while_equal(T const volatile& flag, const T value) {
 }
 
 template <typename T>
-typename std::enable_if<std::is_integral<T>::value, void>::type
-spinwait_until_equal(T const volatile& flag, const T value) {
+std::enable_if_t<std::is_integral<T>::value, void> spinwait_until_equal(
+    T const volatile& flag, const T value) {
   Kokkos::store_fence();
   uint32_t i = 0;
   while (value != flag) {
@@ -121,8 +121,8 @@ spinwait_until_equal(T const volatile& flag, const T value) {
 }
 
 template <typename T>
-typename std::enable_if<std::is_integral<T>::value, void>::type
-yield_until_equal(T const volatile& flag, const T value) {
+std::enable_if_t<std::is_integral<T>::value, void> yield_until_equal(
+    T const volatile& flag, const T value) {
   Kokkos::store_fence();
   uint32_t i = 0;
   while (value != flag) {
diff --git a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp
index c0c1fdf6b..e1f59c1d8 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp
@@ -1,3 +1,8 @@
+
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include "Kokkos_Macros.hpp"
 #include "Kokkos_Stacktrace.hpp"
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp b/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp
new file mode 100644
index 000000000..644dcf7fa
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp
@@ -0,0 +1,220 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STRING_MANIPULATION_HPP
+#define KOKKOS_STRING_MANIPULATION_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cstddef>
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+// This header provides a subset of the functionality from <cstring>.  In
+// contrast to the standard library header, functions are usable on the device
+// and in constant expressions.  It also includes functionality from <charconv>
+// to convert an integer value to a character sequence.
+
+//<editor-fold desc="String examination">
+// returns the length of a given string
+KOKKOS_INLINE_FUNCTION constexpr std::size_t strlen(const char *str) {
+  std::size_t i = 0;
+  while (str[i] != '\0') {
+    ++i;
+  }
+  return i;
+}
+
+// compares two strings
+KOKKOS_INLINE_FUNCTION constexpr int strcmp(const char *lhs, const char *rhs) {
+  while (*lhs == *rhs++) {
+    if (*lhs++ == '\0') {
+      return 0;
+    }
+  }
+  return static_cast<unsigned int>(*lhs) -
+         static_cast<unsigned int>(*(rhs - 1));
+}
+
+// compares a certain number of characters from two strings
+KOKKOS_INLINE_FUNCTION constexpr int strncmp(const char *lhs, const char *rhs,
+                                             std::size_t count) {
+  for (std::size_t i = 0; i < count; ++i) {
+    if (lhs[i] != rhs[i]) {
+      return lhs[i] < rhs[i] ? -1 : 1;
+    } else if (lhs[i] == '\0') {
+      return 0;
+    }
+  }
+  return 0;
+}
+//</editor-fold>
+
+//<editor-fold desc="String manipulation">
+// copies one string to another
+KOKKOS_INLINE_FUNCTION constexpr char *strcpy(char *dest, const char *src) {
+  char *d = dest;
+  for (; (*d = *src) != '\0'; ++d, ++src) {
+  }
+  return dest;
+}
+
+// copies a certain amount of characters from one string to another
+KOKKOS_INLINE_FUNCTION constexpr char *strncpy(char *dest, const char *src,
+                                               std::size_t count) {
+  if (count != 0) {
+    char *d = dest;
+    do {
+      if ((*d++ = *src++) == '\0') {
+        while (--count != 0) {
+          *d++ = '\0';
+        }
+        break;
+      }
+    } while (--count != 0);
+  }
+  return dest;
+}
+
+// concatenates two strings
+KOKKOS_INLINE_FUNCTION constexpr char *strcat(char *dest, const char *src) {
+  char *d = dest;
+  for (; *d != '\0'; ++d) {
+  }
+  while ((*d++ = *src++) != '\0') {
+  }
+  return dest;
+}
+
+// concatenates a certain amount of characters of two strings
+KOKKOS_INLINE_FUNCTION constexpr char *strncat(char *dest, const char *src,
+                                               std::size_t count) {
+  if (count != 0) {
+    char *d = dest;
+    for (; *d != '\0'; ++d) {
+    }
+    do {
+      if ((*d = *src++) == '\0') {
+        break;
+      }
+      d++;
+    } while (--count != 0);
+    *d = '\0';
+  }
+  return dest;
+}
+//</editor-fold>
+
+//<editor-fold desc="Character conversions">
+template <class Unsigned>
+KOKKOS_FUNCTION constexpr unsigned int to_chars_len(Unsigned val) {
+  unsigned int const base = 10;
+  static_assert(std::is_integral<Unsigned>::value, "implementation bug");
+  static_assert(std::is_unsigned<Unsigned>::value, "implementation bug");
+  unsigned int n = 1;
+  while (val >= base) {
+    val /= base;
+    ++n;
+  }
+  return n;
+}
+template <class Unsigned>
+KOKKOS_FUNCTION constexpr void to_chars_impl(char *first, unsigned int len,
+                                             Unsigned val) {
+  unsigned int const base = 10;
+  static_assert(std::is_integral<Unsigned>::value, "implementation bug");
+  static_assert(std::is_unsigned<Unsigned>::value, "implementation bug");
+  unsigned int pos = len - 1;
+  while (val > 0) {
+    auto const num = val % base;
+    val /= base;
+    first[pos] = '0' + num;
+    --pos;
+  }
+}
+
+// define values of portable error conditions that correspond to the POSIX error
+// codes
+enum class errc {
+  value_too_large = 75  // equivalent POSIX error is EOVERFLOW
+};
+struct to_chars_result {
+  char *ptr;
+  errc ec;
+};
+
+// converts an integer value to a character sequence
+template <class Integral>
+KOKKOS_FUNCTION constexpr to_chars_result to_chars_i(char *first, char *last,
+                                                     Integral value) {
+  using Unsigned = std::conditional_t<sizeof(Integral) <= sizeof(unsigned int),
+                                      unsigned int, unsigned long long>;
+  Unsigned unsigned_val = value;
+  if (value == 0) {
+    *first = '0';
+    return {first + 1, {}};
+  } else if
+#ifdef KOKKOS_ENABLE_CXX17
+      constexpr
+#endif
+      (std::is_signed<Integral>::value) {
+    if (value < 0) {
+      *first++     = '-';
+      unsigned_val = Unsigned(~value) + Unsigned(1);
+    }
+  }
+  unsigned int const len = to_chars_len(unsigned_val);
+  if (last - first < len) {
+    return {last, errc::value_too_large};
+  }
+  to_chars_impl(first, len, unsigned_val);
+  return {first + len, {}};
+}
+//</editor-fold>
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
index e9f5d91aa..bb89ab914 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
@@ -57,7 +57,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -199,7 +198,7 @@ class TaskBase {
   void add_dependence(TaskBase* dep) {
     // Precondition: lock == m_next
 
-    TaskBase* const lock = (TaskBase*)LockTag;
+    auto* const lock = reinterpret_cast<TaskBase*>(LockTag);
 
     // Assign dependence to m_next.  It will be processed in the subsequent
     // call to schedule.  Error if the dependence is reset.
@@ -221,7 +220,7 @@ class TaskBase {
 
   KOKKOS_INLINE_FUNCTION
   int32_t reference_count() const {
-    return *((int32_t volatile*)(&m_ref_count));
+    return *const_cast<int32_t volatile*>(&m_ref_count);
   }
 };
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
index 7cfd696d2..5e2ebb058 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
@@ -63,7 +63,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
index e74e84a2e..4f565f019 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -64,7 +64,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
index 757e5f988..82af5625e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
@@ -63,7 +63,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
index 3a71aa17e..c8039fa77 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
@@ -62,7 +62,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
index 5f98e8d85..31c737650 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
@@ -63,7 +63,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 #include <cassert>
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp
index 40a9c3bf5..7c893547d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp
@@ -58,7 +58,6 @@
 
 #include <string>
 #include <typeinfo>
-#include <stdexcept>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
index f53dfe5a9..1d6c766a7 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
@@ -80,8 +80,9 @@ class TaskTeamMemberAdapter : public TeamMember {
   // type that we're adapting
   template <typename... Args>
   KOKKOS_INLINE_FUNCTION explicit TaskTeamMemberAdapter(
-      typename std::enable_if<std::is_constructible<TeamMember, Args...>::value,
-                              Scheduler>::type arg_scheduler,
+      std::enable_if_t<std::is_constructible<TeamMember, Args...>::value,
+                       Scheduler>
+          arg_scheduler,
       Args&&... args)  // TODO @tasking @minor DSH noexcept specification
       : TeamMember(std::forward<Args>(args)...),
         m_scheduler(
diff --git a/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp
index e734b369b..702fc0997 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp
@@ -179,9 +179,8 @@ void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy,
   if (should_tune(policy)) {
     std::string label = label_in;
     if (label_in.empty()) {
-      using policy_type =
-          typename std::remove_reference<decltype(policy)>::type;
-      using work_tag = typename policy_type::work_tag;
+      using policy_type = std::remove_reference_t<decltype(policy)>;
+      using work_tag    = typename policy_type::work_tag;
       Kokkos::Impl::ParallelConstructName<Functor, work_tag> name(label);
       label = name.get();
     }
@@ -205,9 +204,8 @@ void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy,
   if (should_tune(policy)) {
     std::string label = label_in;
     if (label_in.empty()) {
-      using policy_type =
-          typename std::remove_reference<decltype(policy)>::type;
-      using work_tag = typename policy_type::work_tag;
+      using policy_type = std::remove_reference_t<decltype(policy)>;
+      using work_tag    = typename policy_type::work_tag;
       Kokkos::Impl::ParallelConstructName<Functor, work_tag> name(label);
       label = name.get();
     }
@@ -312,9 +310,8 @@ void generic_report_results(const std::string& label_in, Map& map,
   if (should_tune(policy)) {
     std::string label = label_in;
     if (label_in.empty()) {
-      using policy_type =
-          typename std::remove_reference<decltype(policy)>::type;
-      using work_tag = typename policy_type::work_tag;
+      using policy_type = std::remove_reference_t<decltype(policy)>;
+      using work_tag    = typename policy_type::work_tag;
       Kokkos::Impl::ParallelConstructName<Functor, work_tag> name(label);
       label = name.get();
     }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
index aa38388ac..38edc118d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
@@ -115,8 +115,7 @@ struct has_condition<DefaultType, Condition, S, Pack...> {
  public:
   enum : bool { value = self_value || next::value };
 
-  using type =
-      typename std::conditional<self_value, S, typename next::type>::type;
+  using type = std::conditional_t<self_value, S, typename next::type>;
 };
 
 template <class... Args>
@@ -156,10 +155,9 @@ struct if_c {
 
   using type = FalseType;
 
-  using value_type = typename std::remove_const<
-      typename std::remove_reference<type>::type>::type;
+  using value_type = std::remove_const_t<std::remove_reference_t<type>>;
 
-  using const_value_type = typename std::add_const<value_type>::type;
+  using const_value_type = std::add_const_t<value_type>;
 
   static KOKKOS_INLINE_FUNCTION const_value_type& select(const_value_type& v) {
     return v;
@@ -191,10 +189,9 @@ struct if_c<true, TrueType, FalseType> {
 
   using type = TrueType;
 
-  using value_type = typename std::remove_const<
-      typename std::remove_reference<type>::type>::type;
+  using value_type = std::remove_const_t<std::remove_reference_t<type>>;
 
-  using const_value_type = typename std::add_const<value_type>::type;
+  using const_value_type = std::add_const_t<value_type>;
 
   static KOKKOS_INLINE_FUNCTION const_value_type& select(const_value_type& v) {
     return v;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
index bea7c2c9d..37b74103d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -65,6 +65,9 @@ struct identity {
 template <typename T>
 using identity_t = typename identity<T>::type;
 
+template <typename... Is>
+struct always_true : std::true_type {};
+
 #if defined(__cpp_lib_void_t)
 // since C++17
 using std::void_t;
diff --git a/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
index ace826dd5..677326334 100644
--- a/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
@@ -117,7 +117,7 @@ struct ObjectWithVLAEmulation {
   using vla_entry_count_type = EntryCountType;
 
   using iterator       = VLAValueType*;
-  using const_iterator = typename std::add_const<VLAValueType>::type*;
+  using const_iterator = std::add_const_t<VLAValueType>*;
 
   // TODO @tasking @minor DSH require that Derived be marked final? (note that
   // std::is_final is C++14)
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
index fbda3e093..12a5fa288 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
@@ -55,7 +55,7 @@ struct ViewDataAnalysis<DataType, ArrayLayout, Kokkos::Array<V, N, P>> {
  private:
   using array_analysis = ViewArrayAnalysis<DataType>;
 
-  static_assert(std::is_same<P, void>::value, "");
+  static_assert(std::is_void<P>::value, "");
   static_assert(std::is_same<typename array_analysis::non_const_value_type,
                              Kokkos::Array<V, N, P>>::value,
                 "");
@@ -75,7 +75,7 @@ struct ViewDataAnalysis<DataType, ArrayLayout, Kokkos::Array<V, N, P>> {
 
   using array_scalar_dimension = typename dimension::template append<N>::type;
 
-  using scalar_type = typename std::conditional<is_const, const V, V>::type;
+  using scalar_type           = std::conditional_t<is_const, const V, V>;
   using non_const_scalar_type = V;
   using const_scalar_type     = const V;
 
@@ -230,8 +230,8 @@ class ViewMapping<Traits, Kokkos::Array<>> {
   }
 
   using reference_type =
-      typename std::conditional<is_contiguous_reference, contiguous_reference,
-                                strided_reference>::type;
+      std::conditional_t<is_contiguous_reference, contiguous_reference,
+                         strided_reference>;
 
   using pointer_type = handle_type;
 
@@ -350,7 +350,8 @@ class ViewMapping<Traits, Kokkos::Array<>> {
   template <class... P>
   Kokkos::Impl::SharedAllocationRecord<> *allocate_shared(
       Kokkos::Impl::ViewCtorProp<P...> const &arg_prop,
-      typename Traits::array_layout const &arg_layout) {
+      typename Traits::array_layout const &arg_layout,
+      bool execution_space_specified) {
     using alloc_prop = Kokkos::Impl::ViewCtorProp<P...>;
 
     using execution_space = typename alloc_prop::execution_space;
@@ -373,12 +374,21 @@ class ViewMapping<Traits, Kokkos::Array<>> {
         static_cast<Kokkos::Impl::ViewCtorProp<void, std::string> const &>(
             arg_prop)
             .value;
-    // Allocate memory from the memory space and create tracking record.
-    record_type *const record = record_type::allocate(
+    const execution_space &exec_space =
+        static_cast<Kokkos::Impl::ViewCtorProp<void, execution_space> const &>(
+            arg_prop)
+            .value;
+    const memory_space &mem_space =
         static_cast<Kokkos::Impl::ViewCtorProp<void, memory_space> const &>(
             arg_prop)
-            .value,
-        alloc_name, alloc_size);
+            .value;
+
+    // Allocate memory from the memory space and create tracking record.
+    record_type *const record =
+        execution_space_specified
+            ? record_type::allocate(exec_space, mem_space, alloc_name,
+                                    alloc_size)
+            : record_type::allocate(mem_space, alloc_name, alloc_size);
 
     if (alloc_size) {
       m_impl_handle =
@@ -386,12 +396,12 @@ class ViewMapping<Traits, Kokkos::Array<>> {
 
       if (alloc_prop::initialize) {
         // The functor constructs and destroys
-        record->m_destroy = functor_type(
-            static_cast<Kokkos::Impl::ViewCtorProp<void, execution_space> const
-                            &>(arg_prop)
-                .value,
-            (pointer_type)m_impl_handle, m_impl_offset.span() * Array_N,
-            alloc_name);
+        record->m_destroy =
+            execution_space_specified
+                ? functor_type(exec_space, (pointer_type)m_impl_handle,
+                               m_impl_offset.span() * Array_N, alloc_name)
+                : functor_type((pointer_type)m_impl_handle,
+                               m_impl_offset.span() * Array_N, alloc_name);
 
         record->m_destroy.construct_shared_allocation();
       }
@@ -406,10 +416,10 @@ class ViewMapping<Traits, Kokkos::Array<>> {
 template <class DstTraits, class SrcTraits>
 class ViewMapping<
     DstTraits, SrcTraits,
-    typename std::enable_if<(
+    std::enable_if_t<(
         std::is_same<typename DstTraits::memory_space,
                      typename SrcTraits::memory_space>::value &&
-        std::is_same<typename DstTraits::specialize, void>::value &&
+        std::is_void<typename DstTraits::specialize>::value &&
         (std::is_same<typename DstTraits::array_layout,
                       Kokkos::LayoutLeft>::value ||
          std::is_same<typename DstTraits::array_layout,
@@ -422,7 +432,7 @@ class ViewMapping<
          std::is_same<typename SrcTraits::array_layout,
                       Kokkos::LayoutRight>::value ||
          std::is_same<typename SrcTraits::array_layout,
-                      Kokkos::LayoutStride>::value))>::type> {
+                      Kokkos::LayoutStride>::value))>> {
  public:
   // Can only convert to View::array_type
 
@@ -506,14 +516,14 @@ class ViewMapping<
 
 template <class SrcTraits, class... Args>
 class ViewMapping<
-    typename std::enable_if<(
+    std::enable_if_t<(
         std::is_same<typename SrcTraits::specialize, Kokkos::Array<>>::value &&
         (std::is_same<typename SrcTraits::array_layout,
                       Kokkos::LayoutLeft>::value ||
          std::is_same<typename SrcTraits::array_layout,
                       Kokkos::LayoutRight>::value ||
          std::is_same<typename SrcTraits::array_layout,
-                      Kokkos::LayoutStride>::value))>::type,
+                      Kokkos::LayoutStride>::value))>,
     SrcTraits, Args...> {
  private:
   static_assert(SrcTraits::rank == sizeof...(Args), "");
@@ -558,36 +568,34 @@ class ViewMapping<
 
   // Subview's layout
   using array_layout =
-      typename std::conditional<((rank == 0) ||
-                                 (rank <= 2 && R0 &&
-                                  std::is_same<typename SrcTraits::array_layout,
-                                               Kokkos::LayoutLeft>::value) ||
-                                 (rank <= 2 && R0_rev &&
-                                  std::is_same<typename SrcTraits::array_layout,
-                                               Kokkos::LayoutRight>::value)),
-                                typename SrcTraits::array_layout,
-                                Kokkos::LayoutStride>::type;
+      std::conditional_t<((rank == 0) ||
+                          (rank <= 2 && R0 &&
+                           std::is_same<typename SrcTraits::array_layout,
+                                        Kokkos::LayoutLeft>::value) ||
+                          (rank <= 2 && R0_rev &&
+                           std::is_same<typename SrcTraits::array_layout,
+                                        Kokkos::LayoutRight>::value)),
+                         typename SrcTraits::array_layout,
+                         Kokkos::LayoutStride>;
 
   using value_type = typename SrcTraits::value_type;
 
-  using data_type = typename std::conditional<
+  using data_type = std::conditional_t<
       rank == 0, value_type,
-      typename std::conditional<
+      std::conditional_t<
           rank == 1, value_type *,
-          typename std::conditional<
+          std::conditional_t<
               rank == 2, value_type **,
-              typename std::conditional<
+              std::conditional_t<
                   rank == 3, value_type ***,
-                  typename std::conditional<
+                  std::conditional_t<
                       rank == 4, value_type ****,
-                      typename std::conditional<
+                      std::conditional_t<
                           rank == 5, value_type *****,
-                          typename std::conditional<
+                          std::conditional_t<
                               rank == 6, value_type ******,
-                              typename std::conditional<
-                                  rank == 7, value_type *******,
-                                  value_type ********>::type>::type>::type>::
-                      type>::type>::type>::type>::type;
+                              std::conditional_t<rank == 7, value_type *******,
+                                                 value_type ********>>>>>>>>;
 
  public:
   using traits_type = Kokkos::ViewTraits<data_type, array_layout,
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
index cc3953c05..8bc8f8686 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -53,7 +53,6 @@ namespace Impl {
 
 struct WithoutInitializing_t {};
 struct AllowPadding_t {};
-struct NullSpace_t {};
 
 template <typename>
 struct is_view_ctor_property : public std::false_type {};
@@ -64,9 +63,6 @@ struct is_view_ctor_property<WithoutInitializing_t> : public std::true_type {};
 template <>
 struct is_view_ctor_property<AllowPadding_t> : public std::true_type {};
 
-template <>
-struct is_view_ctor_property<NullSpace_t> : public std::true_type {};
-
 //----------------------------------------------------------------------------
 /**\brief Whether a type can be used for a view label */
 
@@ -91,10 +87,15 @@ struct ViewCtorProp;
 template <typename Specialize, typename T>
 struct CommonViewAllocProp;
 
+/* Dummy to allow for empty ViewCtorProp object
+ */
+template <>
+struct ViewCtorProp<void> {};
+
 /* Common value_type stored as ViewCtorProp
  */
 template <typename Specialize, typename T>
-struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T> > {
+struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T>> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
@@ -113,7 +114,7 @@ struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T> > {
  *  that avoid duplicate base class errors
  */
 template <unsigned I>
-struct ViewCtorProp<void, std::integral_constant<unsigned, I> > {
+struct ViewCtorProp<void, std::integral_constant<unsigned, I>> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
@@ -124,10 +125,10 @@ struct ViewCtorProp<void, std::integral_constant<unsigned, I> > {
 
 /* Property flags have constexpr value */
 template <typename P>
-struct ViewCtorProp<typename std::enable_if<
-                        std::is_same<P, AllowPadding_t>::value ||
-                        std::is_same<P, WithoutInitializing_t>::value>::type,
-                    P> {
+struct ViewCtorProp<
+    std::enable_if_t<std::is_same<P, AllowPadding_t>::value ||
+                     std::is_same<P, WithoutInitializing_t>::value>,
+    P> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
@@ -136,13 +137,12 @@ struct ViewCtorProp<typename std::enable_if<
 
   ViewCtorProp(const type &) {}
 
-  static constexpr type value = type();
+  type value = type();
 };
 
 /* Map input label type to std::string */
 template <typename Label>
-struct ViewCtorProp<typename std::enable_if<is_view_label<Label>::value>::type,
-                    Label> {
+struct ViewCtorProp<std::enable_if_t<is_view_label<Label>::value>, Label> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
@@ -156,10 +156,9 @@ struct ViewCtorProp<typename std::enable_if<is_view_label<Label>::value>::type,
 };
 
 template <typename Space>
-struct ViewCtorProp<
-    typename std::enable_if<Kokkos::is_memory_space<Space>::value ||
-                            Kokkos::is_execution_space<Space>::value>::type,
-    Space> {
+struct ViewCtorProp<std::enable_if_t<Kokkos::is_memory_space<Space>::value ||
+                                     Kokkos::is_execution_space<Space>::value>,
+                    Space> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp
index 691562235..8d367ceba 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp
@@ -58,39 +58,39 @@ namespace Kokkos {
 template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN1>
 struct is_array_layout<Kokkos::Experimental::LayoutTiled<
-    OuterP, InnerP, ArgN0, ArgN1, 0, 0, 0, 0, 0, 0, true> >
+    OuterP, InnerP, ArgN0, ArgN1, 0, 0, 0, 0, 0, 0, true>>
     : public std::true_type {};
 
 template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN1, unsigned ArgN2>
 struct is_array_layout<Kokkos::Experimental::LayoutTiled<
-    OuterP, InnerP, ArgN0, ArgN1, ArgN2, 0, 0, 0, 0, 0, true> >
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, 0, 0, 0, 0, 0, true>>
     : public std::true_type {};
 
 template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN1, unsigned ArgN2, unsigned ArgN3>
 struct is_array_layout<Kokkos::Experimental::LayoutTiled<
-    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, 0, 0, 0, 0, true> >
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, 0, 0, 0, 0, true>>
     : public std::true_type {};
 
 template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4>
 struct is_array_layout<Kokkos::Experimental::LayoutTiled<
-    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, 0, 0, 0, true> >
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, 0, 0, 0, true>>
     : public std::true_type {};
 
 template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4,
           unsigned ArgN5>
 struct is_array_layout<Kokkos::Experimental::LayoutTiled<
-    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, 0, 0, true> >
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, 0, 0, true>>
     : public std::true_type {};
 
 template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4,
           unsigned ArgN5, unsigned ArgN6>
 struct is_array_layout<Kokkos::Experimental::LayoutTiled<
-    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, 0, true> >
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, 0, true>>
     : public std::true_type {};
 
 template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
@@ -98,7 +98,7 @@ template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN5, unsigned ArgN6, unsigned ArgN7>
 struct is_array_layout<
     Kokkos::Experimental::LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2,
-                                      ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> >
+                                      ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true>>
     : public std::true_type {};
 
 template <class L>
@@ -109,7 +109,7 @@ template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
           unsigned ArgN5, unsigned ArgN6, unsigned ArgN7, bool IsPowerTwo>
 struct is_array_layout_tiled<Kokkos::Experimental::LayoutTiled<
     OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7,
-    IsPowerTwo> > : public std::true_type {
+    IsPowerTwo>> : public std::true_type {
 };  // Last template parameter "true" meaning this currently only supports
     // powers-of-two
 
@@ -118,9 +118,9 @@ namespace Impl {
 template <class Dimension, class Layout>
 struct ViewOffset<
     Dimension, Layout,
-    typename std::enable_if<((Dimension::rank <= 8) && (Dimension::rank >= 2) &&
-                             is_array_layout<Layout>::value &&
-                             is_array_layout_tiled<Layout>::value)>::type> {
+    std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) &&
+                      is_array_layout<Layout>::value &&
+                      is_array_layout_tiled<Layout>::value)>> {
  public:
   static constexpr Kokkos::Iterate outer_pattern = Layout::outer_pattern;
   static constexpr Kokkos::Iterate inner_pattern = Layout::inner_pattern;
@@ -493,8 +493,14 @@ struct ViewOffset<
   //----------------------------------------
 
   KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const {
-    return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N2, m_dim.N3,
-                        m_dim.N4, m_dim.N5, m_dim.N6, m_dim.N7);
+    return array_layout((VORank > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX),
+                        (VORank > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX),
+                        (VORank > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX),
+                        (VORank > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX),
+                        (VORank > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX),
+                        (VORank > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX),
+                        (VORank > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX),
+                        (VORank > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX));
   }
 
   KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const {
@@ -660,10 +666,10 @@ struct ViewOffset<
 };
 
 // FIXME Remove the out-of-class definitions when we require C++17
-#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE                                      \
-  typename std::enable_if<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \
-                           is_array_layout<Layout>::value &&                   \
-                           is_array_layout_tiled<Layout>::value)>::type
+#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE                               \
+  std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \
+                    is_array_layout<Layout>::value &&                   \
+                    is_array_layout_tiled<Layout>::value)>
 template <class Dimension, class Layout>
 constexpr Kokkos::Iterate ViewOffset<
     Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::outer_pattern;
@@ -754,18 +760,17 @@ template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
           typename iType1>
-class ViewMapping<
-    typename std::enable_if<(N2 == 0 && N3 == 0 && N4 == 0 && N5 == 0 &&
-                             N6 == 0 && N7 == 0)>::type  // void
-    ,
-    Kokkos::ViewTraits<
-        T**,
-        Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4,
-                                          N5, N6, N7, true>,
-        P...>,
-    Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
-                                      N6, N7, true>,
-    iType0, iType1> {
+class ViewMapping<std::enable_if_t<(N2 == 0 && N3 == 0 && N4 == 0 && N5 == 0 &&
+                                    N6 == 0 && N7 == 0)>  // void
+                  ,
+                  Kokkos::ViewTraits<
+                      T**,
+                      Kokkos::Experimental::LayoutTiled<
+                          OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                      P...>,
+                  Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
+                                                    N3, N4, N5, N6, N7, true>,
+                  iType0, iType1> {
  public:
   using src_layout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
@@ -774,11 +779,10 @@ class ViewMapping<
 
   static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
   static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
-  using array_layout =
-      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
-                                Kokkos::LayoutRight>::type;
-  using traits = Kokkos::ViewTraits<T[N0][N1], array_layout, P...>;
-  using type   = Kokkos::View<T[N0][N1], array_layout, P...>;
+  using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft,
+                                          Kokkos::LayoutRight>;
+  using traits       = Kokkos::ViewTraits<T[N0][N1], array_layout, P...>;
+  using type         = Kokkos::View<T[N0][N1], array_layout, P...>;
 
   KOKKOS_INLINE_FUNCTION static void assign(
       ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src,
@@ -807,8 +811,8 @@ template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
           typename iType1, typename iType2>
-class ViewMapping<typename std::enable_if<(N3 == 0 && N4 == 0 && N5 == 0 &&
-                                           N6 == 0 && N7 == 0)>::type  // void
+class ViewMapping<std::enable_if_t<(N3 == 0 && N4 == 0 && N5 == 0 && N6 == 0 &&
+                                    N7 == 0)>  // void
                   ,
                   Kokkos::ViewTraits<
                       T***,
@@ -826,11 +830,10 @@ class ViewMapping<typename std::enable_if<(N3 == 0 && N4 == 0 && N5 == 0 &&
 
   static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
   static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
-  using array_layout =
-      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
-                                Kokkos::LayoutRight>::type;
-  using traits = Kokkos::ViewTraits<T[N0][N1][N2], array_layout, P...>;
-  using type   = Kokkos::View<T[N0][N1][N2], array_layout, P...>;
+  using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft,
+                                          Kokkos::LayoutRight>;
+  using traits       = Kokkos::ViewTraits<T[N0][N1][N2], array_layout, P...>;
+  using type         = Kokkos::View<T[N0][N1][N2], array_layout, P...>;
 
   KOKKOS_INLINE_FUNCTION static void assign(
       ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src,
@@ -865,17 +868,17 @@ template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
           typename iType1, typename iType2, typename iType3>
-class ViewMapping<typename std::enable_if<(N4 == 0 && N5 == 0 && N6 == 0 &&
-                                           N7 == 0)>::type  // void
-                  ,
-                  Kokkos::ViewTraits<
-                      T****,
-                      Kokkos::Experimental::LayoutTiled<
-                          OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
-                      P...>,
-                  Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                    N3, N4, N5, N6, N7, true>,
-                  iType0, iType1, iType2, iType3> {
+class ViewMapping<
+    std::enable_if_t<(N4 == 0 && N5 == 0 && N6 == 0 && N7 == 0)>  // void
+    ,
+    Kokkos::ViewTraits<
+        T****,
+        Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4,
+                                          N5, N6, N7, true>,
+        P...>,
+    Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
+                                      N6, N7, true>,
+    iType0, iType1, iType2, iType3> {
  public:
   using src_layout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
@@ -884,9 +887,8 @@ class ViewMapping<typename std::enable_if<(N4 == 0 && N5 == 0 && N6 == 0 &&
 
   static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
   static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
-  using array_layout =
-      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
-                                Kokkos::LayoutRight>::type;
+  using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft,
+                                          Kokkos::LayoutRight>;
   using traits = Kokkos::ViewTraits<T[N0][N1][N2][N3], array_layout, P...>;
   using type   = Kokkos::View<T[N0][N1][N2][N3], array_layout, P...>;
 
@@ -928,17 +930,16 @@ template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
           typename iType1, typename iType2, typename iType3, typename iType4>
-class ViewMapping<
-    typename std::enable_if<(N5 == 0 && N6 == 0 && N7 == 0)>::type  // void
-    ,
-    Kokkos::ViewTraits<
-        T*****,
-        Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4,
-                                          N5, N6, N7, true>,
-        P...>,
-    Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
-                                      N6, N7, true>,
-    iType0, iType1, iType2, iType3, iType4> {
+class ViewMapping<std::enable_if_t<(N5 == 0 && N6 == 0 && N7 == 0)>  // void
+                  ,
+                  Kokkos::ViewTraits<
+                      T*****,
+                      Kokkos::Experimental::LayoutTiled<
+                          OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                      P...>,
+                  Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
+                                                    N3, N4, N5, N6, N7, true>,
+                  iType0, iType1, iType2, iType3, iType4> {
  public:
   using src_layout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
@@ -947,9 +948,8 @@ class ViewMapping<
 
   static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
   static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
-  using array_layout =
-      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
-                                Kokkos::LayoutRight>::type;
+  using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft,
+                                          Kokkos::LayoutRight>;
   using traits = Kokkos::ViewTraits<T[N0][N1][N2][N3][N4], array_layout, P...>;
   using type   = Kokkos::View<T[N0][N1][N2][N3][N4], array_layout, P...>;
 
@@ -997,7 +997,7 @@ template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
           typename iType1, typename iType2, typename iType3, typename iType4,
           typename iType5>
-class ViewMapping<typename std::enable_if<(N6 == 0 && N7 == 0)>::type  // void
+class ViewMapping<std::enable_if_t<(N6 == 0 && N7 == 0)>  // void
                   ,
                   Kokkos::ViewTraits<
                       T******,
@@ -1015,9 +1015,8 @@ class ViewMapping<typename std::enable_if<(N6 == 0 && N7 == 0)>::type  // void
 
   static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
   static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
-  using array_layout =
-      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
-                                Kokkos::LayoutRight>::type;
+  using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft,
+                                          Kokkos::LayoutRight>;
   using traits =
       Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5], array_layout, P...>;
   using type = Kokkos::View<T[N0][N1][N2][N3][N4][N5], array_layout, P...>;
@@ -1071,7 +1070,7 @@ template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
           typename iType1, typename iType2, typename iType3, typename iType4,
           typename iType5, typename iType6>
-class ViewMapping<typename std::enable_if<(N7 == 0)>::type  // void
+class ViewMapping<std::enable_if_t<(N7 == 0)>  // void
                   ,
                   Kokkos::ViewTraits<
                       T*******,
@@ -1089,9 +1088,8 @@ class ViewMapping<typename std::enable_if<(N7 == 0)>::type  // void
 
   static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
   static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
-  using array_layout =
-      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
-                                Kokkos::LayoutRight>::type;
+  using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft,
+                                          Kokkos::LayoutRight>;
   using traits =
       Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>;
   using type = Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>;
@@ -1151,19 +1149,18 @@ template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
           typename iType1, typename iType2, typename iType3, typename iType4,
           typename iType5, typename iType6, typename iType7>
-class ViewMapping<typename std::enable_if<(N0 != 0 && N1 != 0 && N2 != 0 &&
-                                           N3 != 0 && N4 != 0 && N5 != 0 &&
-                                           N6 != 0 && N7 != 0)>::type  // void
-                  ,
-                  Kokkos::ViewTraits<
-                      T********,
-                      Kokkos::Experimental::LayoutTiled<
-                          OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
-                      P...>,
-                  Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                    N3, N4, N5, N6, N7, true>,
-                  iType0, iType1, iType2, iType3, iType4, iType5, iType6,
-                  iType7> {
+class ViewMapping<
+    std::enable_if_t<(N0 != 0 && N1 != 0 && N2 != 0 && N3 != 0 && N4 != 0 &&
+                      N5 != 0 && N6 != 0 && N7 != 0)>  // void
+    ,
+    Kokkos::ViewTraits<
+        T********,
+        Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4,
+                                          N5, N6, N7, true>,
+        P...>,
+    Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
+                                      N6, N7, true>,
+    iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7> {
  public:
   using src_layout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
@@ -1172,9 +1169,8 @@ class ViewMapping<typename std::enable_if<(N0 != 0 && N1 != 0 && N2 != 0 &&
 
   static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
   static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
-  using array_layout =
-      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
-                                Kokkos::LayoutRight>::type;
+  using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft,
+                                          Kokkos::LayoutRight>;
   using traits =
       Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5][N6][N7], array_layout, P...>;
   using type =
@@ -1244,22 +1240,22 @@ namespace Kokkos {
 template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P>
-KOKKOS_INLINE_FUNCTION Kokkos::View<
-    T[N0][N1],
-    typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                              Kokkos::LayoutLeft, Kokkos::LayoutRight>::type,
-    P...>
-tile_subview(const Kokkos::View<
-                 T**,
-                 Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                   N3, N4, N5, N6, N7, true>,
-                 P...>& src,
-             const size_t i_tile0, const size_t i_tile1) {
+KOKKOS_INLINE_FUNCTION
+    Kokkos::View<T[N0][N1],
+                 std::conditional_t<(InnerP == Kokkos::Iterate::Left),
+                                    Kokkos::LayoutLeft, Kokkos::LayoutRight>,
+                 P...>
+    tile_subview(const Kokkos::View<
+                     T**,
+                     Kokkos::Experimental::LayoutTiled<
+                         OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                     P...>& src,
+                 const size_t i_tile0, const size_t i_tile1) {
   // Force the specialized ViewMapping for extracting a tile
   // by using the first subview argument as the layout.
   using array_layout =
-      typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                                Kokkos::LayoutLeft, Kokkos::LayoutRight>::type;
+      std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft,
+                         Kokkos::LayoutRight>;
   using SrcLayout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                         N6, N7, true>;
@@ -1272,22 +1268,23 @@ tile_subview(const Kokkos::View<
 template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P>
-KOKKOS_INLINE_FUNCTION Kokkos::View<
-    T[N0][N1][N2],
-    typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                              Kokkos::LayoutLeft, Kokkos::LayoutRight>::type,
-    P...>
-tile_subview(const Kokkos::View<
-                 T***,
-                 Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                   N3, N4, N5, N6, N7, true>,
-                 P...>& src,
-             const size_t i_tile0, const size_t i_tile1, const size_t i_tile2) {
+KOKKOS_INLINE_FUNCTION
+    Kokkos::View<T[N0][N1][N2],
+                 std::conditional_t<(InnerP == Kokkos::Iterate::Left),
+                                    Kokkos::LayoutLeft, Kokkos::LayoutRight>,
+                 P...>
+    tile_subview(const Kokkos::View<
+                     T***,
+                     Kokkos::Experimental::LayoutTiled<
+                         OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                     P...>& src,
+                 const size_t i_tile0, const size_t i_tile1,
+                 const size_t i_tile2) {
   // Force the specialized ViewMapping for extracting a tile
   // by using the first subview argument as the layout.
   using array_layout =
-      typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                                Kokkos::LayoutLeft, Kokkos::LayoutRight>::type;
+      std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft,
+                         Kokkos::LayoutRight>;
   using SrcLayout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                         N6, N7, true>;
@@ -1300,23 +1297,23 @@ tile_subview(const Kokkos::View<
 template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P>
-KOKKOS_INLINE_FUNCTION Kokkos::View<
-    T[N0][N1][N2][N3],
-    typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                              Kokkos::LayoutLeft, Kokkos::LayoutRight>::type,
-    P...>
-tile_subview(const Kokkos::View<
-                 T****,
-                 Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                   N3, N4, N5, N6, N7, true>,
-                 P...>& src,
-             const size_t i_tile0, const size_t i_tile1, const size_t i_tile2,
-             const size_t i_tile3) {
+KOKKOS_INLINE_FUNCTION
+    Kokkos::View<T[N0][N1][N2][N3],
+                 std::conditional_t<(InnerP == Kokkos::Iterate::Left),
+                                    Kokkos::LayoutLeft, Kokkos::LayoutRight>,
+                 P...>
+    tile_subview(const Kokkos::View<
+                     T****,
+                     Kokkos::Experimental::LayoutTiled<
+                         OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                     P...>& src,
+                 const size_t i_tile0, const size_t i_tile1,
+                 const size_t i_tile2, const size_t i_tile3) {
   // Force the specialized ViewMapping for extracting a tile
   // by using the first subview argument as the layout.
   using array_layout =
-      typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                                Kokkos::LayoutLeft, Kokkos::LayoutRight>::type;
+      std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft,
+                         Kokkos::LayoutRight>;
   using SrcLayout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                         N6, N7, true>;
@@ -1329,23 +1326,24 @@ tile_subview(const Kokkos::View<
 template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P>
-KOKKOS_INLINE_FUNCTION Kokkos::View<
-    T[N0][N1][N2][N3][N4],
-    typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                              Kokkos::LayoutLeft, Kokkos::LayoutRight>::type,
-    P...>
-tile_subview(const Kokkos::View<
-                 T*****,
-                 Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                   N3, N4, N5, N6, N7, true>,
-                 P...>& src,
-             const size_t i_tile0, const size_t i_tile1, const size_t i_tile2,
-             const size_t i_tile3, const size_t i_tile4) {
+KOKKOS_INLINE_FUNCTION
+    Kokkos::View<T[N0][N1][N2][N3][N4],
+                 std::conditional_t<(InnerP == Kokkos::Iterate::Left),
+                                    Kokkos::LayoutLeft, Kokkos::LayoutRight>,
+                 P...>
+    tile_subview(const Kokkos::View<
+                     T*****,
+                     Kokkos::Experimental::LayoutTiled<
+                         OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                     P...>& src,
+                 const size_t i_tile0, const size_t i_tile1,
+                 const size_t i_tile2, const size_t i_tile3,
+                 const size_t i_tile4) {
   // Force the specialized ViewMapping for extracting a tile
   // by using the first subview argument as the layout.
   using array_layout =
-      typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                                Kokkos::LayoutLeft, Kokkos::LayoutRight>::type;
+      std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft,
+                         Kokkos::LayoutRight>;
   using SrcLayout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                         N6, N7, true>;
@@ -1358,23 +1356,24 @@ tile_subview(const Kokkos::View<
 template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P>
-KOKKOS_INLINE_FUNCTION Kokkos::View<
-    T[N0][N1][N2][N3][N4][N5],
-    typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                              Kokkos::LayoutLeft, Kokkos::LayoutRight>::type,
-    P...>
-tile_subview(const Kokkos::View<
-                 T******,
-                 Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                   N3, N4, N5, N6, N7, true>,
-                 P...>& src,
-             const size_t i_tile0, const size_t i_tile1, const size_t i_tile2,
-             const size_t i_tile3, const size_t i_tile4, const size_t i_tile5) {
+KOKKOS_INLINE_FUNCTION
+    Kokkos::View<T[N0][N1][N2][N3][N4][N5],
+                 std::conditional_t<(InnerP == Kokkos::Iterate::Left),
+                                    Kokkos::LayoutLeft, Kokkos::LayoutRight>,
+                 P...>
+    tile_subview(const Kokkos::View<
+                     T******,
+                     Kokkos::Experimental::LayoutTiled<
+                         OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                     P...>& src,
+                 const size_t i_tile0, const size_t i_tile1,
+                 const size_t i_tile2, const size_t i_tile3,
+                 const size_t i_tile4, const size_t i_tile5) {
   // Force the specialized ViewMapping for extracting a tile
   // by using the first subview argument as the layout.
   using array_layout =
-      typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                                Kokkos::LayoutLeft, Kokkos::LayoutRight>::type;
+      std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft,
+                         Kokkos::LayoutRight>;
   using SrcLayout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                         N6, N7, true>;
@@ -1387,24 +1386,25 @@ tile_subview(const Kokkos::View<
 template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P>
-KOKKOS_INLINE_FUNCTION Kokkos::View<
-    T[N0][N1][N2][N3][N4][N5][N6],
-    typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                              Kokkos::LayoutLeft, Kokkos::LayoutRight>::type,
-    P...>
-tile_subview(const Kokkos::View<
-                 T*******,
-                 Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                   N3, N4, N5, N6, N7, true>,
-                 P...>& src,
-             const size_t i_tile0, const size_t i_tile1, const size_t i_tile2,
-             const size_t i_tile3, const size_t i_tile4, const size_t i_tile5,
-             const size_t i_tile6) {
+KOKKOS_INLINE_FUNCTION
+    Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6],
+                 std::conditional_t<(InnerP == Kokkos::Iterate::Left),
+                                    Kokkos::LayoutLeft, Kokkos::LayoutRight>,
+                 P...>
+    tile_subview(const Kokkos::View<
+                     T*******,
+                     Kokkos::Experimental::LayoutTiled<
+                         OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                     P...>& src,
+                 const size_t i_tile0, const size_t i_tile1,
+                 const size_t i_tile2, const size_t i_tile3,
+                 const size_t i_tile4, const size_t i_tile5,
+                 const size_t i_tile6) {
   // Force the specialized ViewMapping for extracting a tile
   // by using the first subview argument as the layout.
   using array_layout =
-      typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                                Kokkos::LayoutLeft, Kokkos::LayoutRight>::type;
+      std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft,
+                         Kokkos::LayoutRight>;
   using SrcLayout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                         N6, N7, true>;
@@ -1418,24 +1418,25 @@ tile_subview(const Kokkos::View<
 template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
           unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
           unsigned N5, unsigned N6, unsigned N7, class... P>
-KOKKOS_INLINE_FUNCTION Kokkos::View<
-    T[N0][N1][N2][N3][N4][N5][N6][N7],
-    typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                              Kokkos::LayoutLeft, Kokkos::LayoutRight>::type,
-    P...>
-tile_subview(const Kokkos::View<
-                 T********,
-                 Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
-                                                   N3, N4, N5, N6, N7, true>,
-                 P...>& src,
-             const size_t i_tile0, const size_t i_tile1, const size_t i_tile2,
-             const size_t i_tile3, const size_t i_tile4, const size_t i_tile5,
-             const size_t i_tile6, const size_t i_tile7) {
+KOKKOS_INLINE_FUNCTION
+    Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6][N7],
+                 std::conditional_t<(InnerP == Kokkos::Iterate::Left),
+                                    Kokkos::LayoutLeft, Kokkos::LayoutRight>,
+                 P...>
+    tile_subview(const Kokkos::View<
+                     T********,
+                     Kokkos::Experimental::LayoutTiled<
+                         OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
+                     P...>& src,
+                 const size_t i_tile0, const size_t i_tile1,
+                 const size_t i_tile2, const size_t i_tile3,
+                 const size_t i_tile4, const size_t i_tile5,
+                 const size_t i_tile6, const size_t i_tile7) {
   // Force the specialized ViewMapping for extracting a tile
   // by using the first subview argument as the layout.
   using array_layout =
-      typename std::conditional<(InnerP == Kokkos::Iterate::Left),
-                                Kokkos::LayoutLeft, Kokkos::LayoutRight>::type;
+      std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft,
+                         Kokkos::LayoutRight>;
   using SrcLayout =
       Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                         N6, N7, true>;
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index f606a3983..738231677 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -59,6 +59,7 @@
 #include <impl/Kokkos_ViewCtor.hpp>
 #include <impl/Kokkos_Atomic_View.hpp>
 #include <impl/Kokkos_Tools.hpp>
+#include <impl/Kokkos_StringManipulation.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -111,7 +112,7 @@ struct rank_dynamic<Val, Args...> {
   template <unsigned RD>                                                    \
   struct ViewDimension##R<0u, RD> {                                         \
     static constexpr size_t ArgN##R = 0;                                    \
-    typename std::conditional<(RD < 3), size_t, unsigned>::type N##R;       \
+    std::conditional_t<(RD < 3), size_t, unsigned> N##R;                    \
     ViewDimension##R()                        = default;                    \
     ViewDimension##R(const ViewDimension##R&) = default;                    \
     ViewDimension##R& operator=(const ViewDimension##R&) = default;         \
@@ -196,7 +197,14 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension
   KOKKOS_INLINE_FUNCTION
   constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4,
                           size_t n5, size_t n6, size_t n7)
-      : D0(n0), D1(n1), D2(n2), D3(n3), D4(n4), D5(n5), D6(n6), D7(n7) {}
+      : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0),
+        D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1),
+        D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2),
+        D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3),
+        D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4),
+        D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5),
+        D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6),
+        D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {}
 
   KOKKOS_INLINE_FUNCTION
   constexpr size_t extent(const unsigned r) const noexcept {
@@ -346,13 +354,13 @@ struct is_integral_extent_type<std::initializer_list<iType>> {
 template <unsigned I, class... Args>
 struct is_integral_extent {
   // get_type is void when sizeof...(Args) <= I
-  using type = typename std::remove_cv<typename std::remove_reference<
-      typename Kokkos::Impl::get_type<I, Args...>::type>::type>::type;
+  using type = std::remove_cv_t<std::remove_reference_t<
+      typename Kokkos::Impl::get_type<I, Args...>::type>>;
 
   enum : bool { value = is_integral_extent_type<type>::value };
 
   static_assert(value || std::is_integral<type>::value ||
-                    std::is_same<type, void>::value,
+                    std::is_void<type>::value,
                 "subview argument must be either integral or integral extent");
 };
 
@@ -753,8 +761,8 @@ struct ViewDataType<T, ViewDimension<N, Args...>> {
 template <class T>
 struct ViewArrayAnalysis {
   using value_type           = T;
-  using const_value_type     = typename std::add_const<T>::type;
-  using non_const_value_type = typename std::remove_const<T>::type;
+  using const_value_type     = std::add_const_t<T>;
+  using non_const_value_type = std::remove_const_t<T>;
   using static_dimension     = ViewDimension<>;
   using dynamic_dimension    = ViewDimension<>;
   using dimension            = ViewDimension<>;
@@ -869,8 +877,7 @@ struct ViewOffset {
 template <class Dimension>
 struct ViewOffset<
     Dimension, Kokkos::LayoutLeft,
-    typename std::enable_if<(1 >= Dimension::rank ||
-                             0 == Dimension::rank_dynamic)>::type> {
+    std::enable_if_t<(1 >= Dimension::rank || 0 == Dimension::rank_dynamic)>> {
   using is_mapping_plugin = std::true_type;
   using is_regular        = std::true_type;
 
@@ -973,8 +980,15 @@ struct ViewOffset<
 
   KOKKOS_INLINE_FUNCTION
   constexpr array_layout layout() const {
-    return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4,
-                        m_dim.N5, m_dim.N6, m_dim.N7);
+    constexpr auto r = dimension_type::rank;
+    return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX),
+                        (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX),
+                        (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX),
+                        (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX),
+                        (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX),
+                        (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX),
+                        (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX),
+                        (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX));
   }
 
   KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const {
@@ -1152,8 +1166,7 @@ struct ViewOffset<
 template <class Dimension>
 struct ViewOffset<
     Dimension, Kokkos::LayoutLeft,
-    typename std::enable_if<(1 < Dimension::rank &&
-                             0 < Dimension::rank_dynamic)>::type> {
+    std::enable_if_t<(1 < Dimension::rank && 0 < Dimension::rank_dynamic)>> {
   using is_mapping_plugin = std::true_type;
   using is_regular        = std::true_type;
 
@@ -1257,8 +1270,15 @@ struct ViewOffset<
 
   KOKKOS_INLINE_FUNCTION
   constexpr array_layout layout() const {
-    return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4,
-                        m_dim.N5, m_dim.N6, m_dim.N7);
+    constexpr auto r = dimension_type::rank;
+    return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX),
+                        (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX),
+                        (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX),
+                        (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX),
+                        (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX),
+                        (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX),
+                        (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX),
+                        (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX));
   }
 
   KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const {
@@ -1496,8 +1516,7 @@ struct ViewOffset<
 template <class Dimension>
 struct ViewOffset<
     Dimension, Kokkos::LayoutRight,
-    typename std::enable_if<(1 >= Dimension::rank ||
-                             0 == Dimension::rank_dynamic)>::type> {
+    std::enable_if_t<(1 >= Dimension::rank || 0 == Dimension::rank_dynamic)>> {
   using is_mapping_plugin = std::true_type;
   using is_regular        = std::true_type;
 
@@ -1602,8 +1621,15 @@ struct ViewOffset<
 
   KOKKOS_INLINE_FUNCTION
   constexpr array_layout layout() const {
-    return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4,
-                        m_dim.N5, m_dim.N6, m_dim.N7);
+    constexpr auto r = dimension_type::rank;
+    return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX),
+                        (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX),
+                        (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX),
+                        (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX),
+                        (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX),
+                        (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX),
+                        (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX),
+                        (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX));
   }
 
   KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const {
@@ -1783,8 +1809,7 @@ struct ViewOffset<
 template <class Dimension>
 struct ViewOffset<
     Dimension, Kokkos::LayoutRight,
-    typename std::enable_if<(1 < Dimension::rank &&
-                             0 < Dimension::rank_dynamic)>::type> {
+    std::enable_if_t<(1 < Dimension::rank && 0 < Dimension::rank_dynamic)>> {
   using is_mapping_plugin = std::true_type;
   using is_regular        = std::true_type;
 
@@ -1885,8 +1910,15 @@ struct ViewOffset<
 
   KOKKOS_INLINE_FUNCTION
   constexpr array_layout layout() const {
-    return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4,
-                        m_dim.N5, m_dim.N6, m_dim.N7);
+    constexpr auto r = dimension_type::rank;
+    return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX),
+                        (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX),
+                        (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX),
+                        (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX),
+                        (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX),
+                        (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX),
+                        (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX),
+                        (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX));
   }
 
   KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const {
@@ -2414,10 +2446,15 @@ struct ViewOffset<Dimension, Kokkos::LayoutStride, void> {
 
   KOKKOS_INLINE_FUNCTION
   constexpr array_layout layout() const {
-    return array_layout(m_dim.N0, m_stride.S0, m_dim.N1, m_stride.S1, m_dim.N2,
-                        m_stride.S2, m_dim.N3, m_stride.S3, m_dim.N4,
-                        m_stride.S4, m_dim.N5, m_stride.S5, m_dim.N6,
-                        m_stride.S6, m_dim.N7, m_stride.S7);
+    constexpr auto r = dimension_type::rank;
+    return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), m_stride.S0,
+                        (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), m_stride.S1,
+                        (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), m_stride.S2,
+                        (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), m_stride.S3,
+                        (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), m_stride.S4,
+                        (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), m_stride.S5,
+                        (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), m_stride.S6,
+                        (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX), m_stride.S7);
   }
 
   KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const {
@@ -2672,11 +2709,11 @@ struct ViewDataHandle {
 
 template <class Traits>
 struct ViewDataHandle<
-    Traits, typename std::enable_if<(
-                std::is_same<typename Traits::non_const_value_type,
-                             typename Traits::value_type>::value &&
-                std::is_same<typename Traits::specialize, void>::value &&
-                Traits::memory_traits::is_atomic)>::type> {
+    Traits,
+    std::enable_if_t<(std::is_same<typename Traits::non_const_value_type,
+                                   typename Traits::value_type>::value &&
+                      std::is_void<typename Traits::specialize>::value &&
+                      Traits::memory_traits::is_atomic)>> {
   using value_type  = typename Traits::value_type;
   using handle_type = typename Kokkos::Impl::AtomicViewDataHandle<Traits>;
   using return_type = typename Kokkos::Impl::AtomicDataElement<Traits>;
@@ -2697,17 +2734,17 @@ struct ViewDataHandle<
 
 template <class Traits>
 struct ViewDataHandle<
-    Traits, typename std::enable_if<(
-                std::is_same<typename Traits::specialize, void>::value &&
-                (!Traits::memory_traits::is_aligned) &&
-                Traits::memory_traits::is_restrict
+    Traits,
+    std::enable_if_t<(std::is_void<typename Traits::specialize>::value &&
+                      (!Traits::memory_traits::is_aligned) &&
+                      Traits::memory_traits::is_restrict
 #ifdef KOKKOS_ENABLE_CUDA
-                && (!(std::is_same<typename Traits::memory_space,
-                                   Kokkos::CudaSpace>::value ||
-                      std::is_same<typename Traits::memory_space,
-                                   Kokkos::CudaUVMSpace>::value))
+                      && (!(std::is_same<typename Traits::memory_space,
+                                         Kokkos::CudaSpace>::value ||
+                            std::is_same<typename Traits::memory_space,
+                                         Kokkos::CudaUVMSpace>::value))
 #endif
-                && (!Traits::memory_traits::is_atomic))>::type> {
+                      && (!Traits::memory_traits::is_atomic))>> {
   using value_type  = typename Traits::value_type;
   using handle_type = typename Traits::value_type* KOKKOS_RESTRICT;
   using return_type = typename Traits::value_type& KOKKOS_RESTRICT;
@@ -2727,17 +2764,17 @@ struct ViewDataHandle<
 
 template <class Traits>
 struct ViewDataHandle<
-    Traits, typename std::enable_if<(
-                std::is_same<typename Traits::specialize, void>::value &&
-                Traits::memory_traits::is_aligned &&
-                (!Traits::memory_traits::is_restrict)
+    Traits,
+    std::enable_if_t<(std::is_void<typename Traits::specialize>::value &&
+                      Traits::memory_traits::is_aligned &&
+                      (!Traits::memory_traits::is_restrict)
 #ifdef KOKKOS_ENABLE_CUDA
-                && (!(std::is_same<typename Traits::memory_space,
-                                   Kokkos::CudaSpace>::value ||
-                      std::is_same<typename Traits::memory_space,
-                                   Kokkos::CudaUVMSpace>::value))
+                      && (!(std::is_same<typename Traits::memory_space,
+                                         Kokkos::CudaSpace>::value ||
+                            std::is_same<typename Traits::memory_space,
+                                         Kokkos::CudaUVMSpace>::value))
 #endif
-                && (!Traits::memory_traits::is_atomic))>::type> {
+                      && (!Traits::memory_traits::is_atomic))>> {
   using value_type = typename Traits::value_type;
   // typedef work-around for intel compilers error #3186: expected typedef
   // declaration
@@ -2773,16 +2810,16 @@ struct ViewDataHandle<
 template <class Traits>
 struct ViewDataHandle<
     Traits,
-    typename std::enable_if<(
-        std::is_same<typename Traits::specialize, void>::value &&
-        Traits::memory_traits::is_aligned && Traits::memory_traits::is_restrict
+    std::enable_if_t<(std::is_void<typename Traits::specialize>::value &&
+                      Traits::memory_traits::is_aligned &&
+                      Traits::memory_traits::is_restrict
 #ifdef KOKKOS_ENABLE_CUDA
-        && (!(std::is_same<typename Traits::memory_space,
-                           Kokkos::CudaSpace>::value ||
-              std::is_same<typename Traits::memory_space,
-                           Kokkos::CudaUVMSpace>::value))
+                      && (!(std::is_same<typename Traits::memory_space,
+                                         Kokkos::CudaSpace>::value ||
+                            std::is_same<typename Traits::memory_space,
+                                         Kokkos::CudaUVMSpace>::value))
 #endif
-        && (!Traits::memory_traits::is_atomic))>::type> {
+                      && (!Traits::memory_traits::is_atomic))>> {
   using value_type = typename Traits::value_type;
   // typedef work-around for intel compilers error #3186: expected typedef
   // declaration
@@ -2863,6 +2900,7 @@ struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
   size_t n;
   bool destroy;
   std::string name;
+  bool default_exec_space;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const size_t i) const {
@@ -2885,13 +2923,26 @@ struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
         ptr(arg_ptr),
         n(arg_n),
         destroy(false),
-        name(std::move(arg_name)) {}
+        name(std::move(arg_name)),
+        default_exec_space(false) {}
+
+  ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n,
+                   std::string arg_name)
+      : space(ExecSpace{}),
+        ptr(arg_ptr),
+        n(arg_n),
+        destroy(false),
+        name(std::move(arg_name)),
+        default_exec_space(true) {}
 
   template <typename Dummy = ValueType>
   std::enable_if_t<std::is_trivial<Dummy>::value &&
                    std::is_trivially_copy_assignable<ValueType>::value>
   construct_dispatch() {
     ValueType value{};
+// On A64FX memset seems to do the wrong thing with regards to first touch
+// leading to the significant performance issues
+#ifndef KOKKOS_ARCH_A64FX
     if (Impl::is_zero_byte(value)) {
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
@@ -2903,7 +2954,6 @@ struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
             "Kokkos::View::initialization [" + name + "] via memset",
             Kokkos::Profiling::Experimental::device_id(space), &kpID);
       }
-
       (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
                        Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
           space,
@@ -2914,9 +2964,14 @@ struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Profiling::endParallelFor(kpID);
       }
+      if (default_exec_space)
+        space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence");
     } else {
+#endif
       parallel_for_implementation(false);
+#ifndef KOKKOS_ARCH_A64FX
     }
+#endif
   }
 
   template <typename Dummy = ValueType>
@@ -2950,7 +3005,8 @@ struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, policy);
       closure.execute();
-      space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence");
+      if (default_exec_space || destroy)
+        space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Profiling::endParallelFor(kpID);
       }
@@ -2973,6 +3029,7 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
   ValueType* ptr;
   size_t n;
   std::string name;
+  bool default_exec_space;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const size_t i) const { ptr[i] = ValueType(); }
@@ -2983,7 +3040,19 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
 
   ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr,
                    size_t const arg_n, std::string arg_name)
-      : space(arg_space), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)) {}
+      : space(arg_space),
+        ptr(arg_ptr),
+        n(arg_n),
+        name(std::move(arg_name)),
+        default_exec_space(false) {}
+
+  ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n,
+                   std::string arg_name)
+      : space(ExecSpace{}),
+        ptr(arg_ptr),
+        n(arg_n),
+        name(std::move(arg_name)),
+        default_exec_space(true) {}
 
   template <typename Dummy = ValueType>
   std::enable_if_t<std::is_trivial<Dummy>::value &&
@@ -2991,6 +3060,9 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
   construct_shared_allocation() {
     // Shortcut for zero initialization
     ValueType value{};
+// On A64FX memset seems to do the wrong thing with regards to first touch
+// leading to the significant performance issues
+#ifndef KOKKOS_ARCH_A64FX
     if (Impl::is_zero_byte(value)) {
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
@@ -3013,9 +3085,14 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Profiling::endParallelFor(kpID);
       }
+      if (default_exec_space)
+        space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence");
     } else {
+#endif
       parallel_for_implementation();
+#ifndef KOKKOS_ARCH_A64FX
     }
+#endif
   }
 
   template <typename Dummy = ValueType>
@@ -3044,8 +3121,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, PolicyType(0, n));
       closure.execute();
-      space.fence(
-          "Kokkos::Impl::ViewValueFunctor: Fence after setting values in view");
+      if (default_exec_space)
+        space.fence(
+            "Kokkos::Impl::ViewValueFunctor: Fence after setting values in "
+            "view");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Profiling::endParallelFor(kpID);
       }
@@ -3062,10 +3141,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
 template <class Traits>
 class ViewMapping<
     Traits,
-    typename std::enable_if<(
-        std::is_same<typename Traits::specialize, void>::value &&
+    std::enable_if_t<(
+        std::is_void<typename Traits::specialize>::value &&
         ViewOffset<typename Traits::dimension, typename Traits::array_layout,
-                   void>::is_mapping_plugin::value)>::type> {
+                   void>::is_mapping_plugin::value)>> {
  public:
   using offset_type = ViewOffset<typename Traits::dimension,
                                  typename Traits::array_layout, void>;
@@ -3196,26 +3275,26 @@ class ViewMapping<
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(std::is_integral<I0>::value &&
-                               // if layout is neither stride nor irregular,
-                               // then just use the handle directly
-                               !(std::is_same<typename Traits::array_layout,
-                                              Kokkos::LayoutStride>::value ||
-                                 !is_regular::value)),
-                              reference_type>::type
+      std::enable_if_t<(std::is_integral<I0>::value &&
+                        // if layout is neither stride nor irregular,
+                        // then just use the handle directly
+                        !(std::is_same<typename Traits::array_layout,
+                                       Kokkos::LayoutStride>::value ||
+                          !is_regular::value)),
+                       reference_type>
       reference(const I0& i0) const {
     return m_impl_handle[i0];
   }
 
   template <typename I0>
   KOKKOS_FORCEINLINE_FUNCTION
-      typename std::enable_if<(std::is_integral<I0>::value &&
-                               // if the layout is strided or irregular, then
-                               // we have to use the offset
-                               (std::is_same<typename Traits::array_layout,
-                                             Kokkos::LayoutStride>::value ||
-                                !is_regular::value)),
-                              reference_type>::type
+      std::enable_if_t<(std::is_integral<I0>::value &&
+                        // if the layout is strided or irregular, then
+                        // we have to use the offset
+                        (std::is_same<typename Traits::array_layout,
+                                      Kokkos::LayoutStride>::value ||
+                         !is_regular::value)),
+                       reference_type>
       reference(const I0& i0) const {
     return m_impl_handle[m_impl_offset(i0)];
   }
@@ -3334,7 +3413,8 @@ class ViewMapping<
   template <class... P>
   Kokkos::Impl::SharedAllocationRecord<>* allocate_shared(
       Kokkos::Impl::ViewCtorProp<P...> const& arg_prop,
-      typename Traits::array_layout const& arg_layout) {
+      typename Traits::array_layout const& arg_layout,
+      bool execution_space_specified) {
     using alloc_prop = Kokkos::Impl::ViewCtorProp<P...>;
 
     using execution_space = typename alloc_prop::execution_space;
@@ -3361,13 +3441,22 @@ class ViewMapping<
         static_cast<Kokkos::Impl::ViewCtorProp<void, std::string> const&>(
             arg_prop)
             .value;
-    // Create shared memory tracking record with allocate memory from the memory
-    // space
-    record_type* const record = record_type::allocate(
+    const execution_space& exec_space =
+        static_cast<Kokkos::Impl::ViewCtorProp<void, execution_space> const&>(
+            arg_prop)
+            .value;
+    const memory_space& mem_space =
         static_cast<Kokkos::Impl::ViewCtorProp<void, memory_space> const&>(
             arg_prop)
-            .value,
-        alloc_name, alloc_size);
+            .value;
+
+    // Create shared memory tracking record with allocate memory from the memory
+    // space
+    record_type* const record =
+        execution_space_specified
+            ? record_type::allocate(exec_space, mem_space, alloc_name,
+                                    alloc_size)
+            : record_type::allocate(mem_space, alloc_name, alloc_size);
 
     m_impl_handle = handle_type(reinterpret_cast<pointer_type>(record->data()));
 
@@ -3377,11 +3466,12 @@ class ViewMapping<
       // Assume destruction is only required when construction is requested.
       // The ViewValueFunctor has both value construction and destruction
       // operators.
-      record->m_destroy = functor_type(
-          static_cast<Kokkos::Impl::ViewCtorProp<void, execution_space> const&>(
-              arg_prop)
-              .value,
-          (value_type*)m_impl_handle, m_impl_offset.span(), alloc_name);
+      record->m_destroy =
+          execution_space_specified
+              ? functor_type(exec_space, (value_type*)m_impl_handle,
+                             m_impl_offset.span(), alloc_name)
+              : functor_type((value_type*)m_impl_handle, m_impl_offset.span(),
+                             alloc_name);
 
       // Construct values
       record->m_destroy.construct_shared_allocation();
@@ -3398,13 +3488,13 @@ class ViewMapping<
 template <class DstTraits, class SrcTraits>
 class ViewMapping<
     DstTraits, SrcTraits,
-    typename std::enable_if<(
+    std::enable_if_t<(
         !(std::is_same<typename SrcTraits::array_layout, LayoutStride>::
               value) &&  // Added to have a new specialization for SrcType of
                          // LayoutStride
         // default mappings
-        std::is_same<typename DstTraits::specialize, void>::value &&
-        std::is_same<typename SrcTraits::specialize, void>::value &&
+        std::is_void<typename DstTraits::specialize>::value &&
+        std::is_void<typename SrcTraits::specialize>::value &&
         (
             // same layout
             std::is_same<typename DstTraits::array_layout,
@@ -3421,7 +3511,7 @@ class ViewMapping<
               std::is_same<typename SrcTraits::array_layout,
                            Kokkos::LayoutRight>::value ||
               std::is_same<typename SrcTraits::array_layout,
-                           Kokkos::LayoutStride>::value))))>::type> {
+                           Kokkos::LayoutStride>::value))))>> {
  private:
   enum {
     is_assignable_space = Kokkos::Impl::MemorySpaceAccess<
@@ -3539,11 +3629,11 @@ class ViewMapping<
 template <class DstTraits, class SrcTraits>
 class ViewMapping<
     DstTraits, SrcTraits,
-    typename std::enable_if<(
+    std::enable_if_t<(
         std::is_same<typename SrcTraits::array_layout,
                      Kokkos::LayoutStride>::value &&
-        std::is_same<typename DstTraits::specialize, void>::value &&
-        std::is_same<typename SrcTraits::specialize, void>::value &&
+        std::is_void<typename DstTraits::specialize>::value &&
+        std::is_void<typename SrcTraits::specialize>::value &&
         (
             // same layout
             std::is_same<typename DstTraits::array_layout,
@@ -3554,7 +3644,7 @@ class ViewMapping<
              std::is_same<typename DstTraits::array_layout,
                           Kokkos::LayoutRight>::value ||
              std::is_same<typename DstTraits::array_layout,
-                          Kokkos::LayoutStride>::value)))>::type> {
+                          Kokkos::LayoutStride>::value)))>> {
  private:
   enum {
     is_assignable_space = Kokkos::Impl::MemorySpaceAccess<
@@ -3704,8 +3794,7 @@ struct SubViewDataTypeImpl<void, ValueType, Kokkos::Experimental::Extents<>> {
 template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class Integral,
           class... Args>
 struct SubViewDataTypeImpl<
-    typename std::enable_if<
-        std::is_integral<typename std::decay<Integral>::type>::value>::type,
+    std::enable_if_t<std::is_integral<std::decay_t<Integral>>::value>,
     ValueType, Kokkos::Experimental::Extents<Ext, Exts...>, Integral, Args...>
     : SubViewDataTypeImpl<void, ValueType,
                           Kokkos::Experimental::Extents<Exts...>, Args...> {};
@@ -3725,7 +3814,7 @@ struct SubViewDataTypeImpl<void, ValueType,
 template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class PairLike,
           class... Args>
 struct SubViewDataTypeImpl<
-    typename std::enable_if<is_pair_like<PairLike>::value>::type, ValueType,
+    std::enable_if_t<is_pair_like<PairLike>::value>, ValueType,
     Kokkos::Experimental::Extents<Ext, Exts...>, PairLike, Args...>
     : SubViewDataTypeImpl<
           void, typename make_all_extents_into_pointers<ValueType>::type*,
@@ -3738,14 +3827,13 @@ struct SubViewDataType : SubViewDataTypeImpl<void, ValueType, Exts, Args...> {};
 
 template <class SrcTraits, class... Args>
 class ViewMapping<
-    typename std::enable_if<(
-        std::is_same<typename SrcTraits::specialize, void>::value &&
-        (std::is_same<typename SrcTraits::array_layout,
-                      Kokkos::LayoutLeft>::value ||
-         std::is_same<typename SrcTraits::array_layout,
-                      Kokkos::LayoutRight>::value ||
-         std::is_same<typename SrcTraits::array_layout,
-                      Kokkos::LayoutStride>::value))>::type,
+    std::enable_if_t<(std::is_void<typename SrcTraits::specialize>::value &&
+                      (std::is_same<typename SrcTraits::array_layout,
+                                    Kokkos::LayoutLeft>::value ||
+                       std::is_same<typename SrcTraits::array_layout,
+                                    Kokkos::LayoutRight>::value ||
+                       std::is_same<typename SrcTraits::array_layout,
+                                    Kokkos::LayoutStride>::value))>,
     SrcTraits, Args...> {
  private:
   static_assert(SrcTraits::rank == sizeof...(Args),
@@ -3792,7 +3880,7 @@ class ViewMapping<
   };
 
   // Subview's layout
-  using array_layout = typename std::conditional<
+  using array_layout = std::conditional_t<
       (            /* Same array layout IF */
        (rank == 0) /* output rank zero */
        || SubviewLegalArgsCompileTime<typename SrcTraits::array_layout,
@@ -3810,7 +3898,7 @@ class ViewMapping<
         std::is_same<typename SrcTraits::array_layout,
                      Kokkos::LayoutRight>::value)  // replace input rank
        ),
-      typename SrcTraits::array_layout, Kokkos::LayoutStride>::type;
+      typename SrcTraits::array_layout, Kokkos::LayoutStride>;
 
   using value_type = typename SrcTraits::value_type;
 
@@ -3920,7 +4008,8 @@ struct OperatorBoundsErrorOnDevice<MapType, true> {
   KOKKOS_INLINE_FUNCTION
   static void run(MapType const& map) {
     SharedAllocationHeader const* const header =
-        SharedAllocationHeader::get_header((void*)(map.data()));
+        SharedAllocationHeader::get_header(
+            static_cast<void const*>(map.data()));
     char const* const label = header->label();
     enum { LEN = 128 };
     char msg[LEN];
@@ -3986,6 +4075,62 @@ KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds(
   }
 }
 
+// primary template: memory space is accessible, do nothing.
+template <class MemorySpace, class AccessSpace,
+          bool = SpaceAccessibility<AccessSpace, MemorySpace>::accessible>
+struct RuntimeCheckViewMemoryAccessViolation {
+  template <class Track, class Map>
+  KOKKOS_FUNCTION RuntimeCheckViewMemoryAccessViolation(char const* const,
+                                                        Track const&,
+                                                        Map const&) {}
+};
+
+// explicit specialization: memory access violation will occur, call abort with
+// the specified error message.
+template <class MemorySpace, class AccessSpace>
+struct RuntimeCheckViewMemoryAccessViolation<MemorySpace, AccessSpace, false> {
+  template <class Track, class Map>
+  KOKKOS_FUNCTION RuntimeCheckViewMemoryAccessViolation(char const* const msg,
+                                                        Track const& track,
+                                                        Map const&) {
+    char err[256] = "";
+    strncat(err, msg, 64);
+    strcat(err, " (label=\"");
+
+    KOKKOS_IF_ON_HOST(({
+      auto const tracker = track.m_tracker;
+
+      if (tracker.has_record()) {
+        strncat(err, tracker.template get_label<void>().c_str(), 128);
+      } else {
+        strcat(err, "**UNMANAGED**");
+      }
+    }))
+
+    KOKKOS_IF_ON_DEVICE(({
+      strcat(err, "**UNAVAILABLE**");
+      (void)track;
+    }))
+
+    strcat(err, "\")");
+
+    Kokkos::abort(err);
+  }
+};
+
+template <class MemorySpace, class Track, class Map, class... Ignore>
+KOKKOS_FUNCTION void runtime_check_memory_access_violation(
+    char const* const msg, Track const& track, Map const& map, Ignore...) {
+  KOKKOS_IF_ON_HOST(
+      ((void)RuntimeCheckViewMemoryAccessViolation<MemorySpace,
+                                                   DefaultHostExecutionSpace>(
+           msg, track, map);))
+  KOKKOS_IF_ON_DEVICE(
+      ((void)RuntimeCheckViewMemoryAccessViolation<MemorySpace,
+                                                   DefaultExecutionSpace>(
+           msg, track, map);))
+}
+
 } /* namespace Impl */
 } /* namespace Kokkos */
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp
index 972b1b6d9..cfa30f6e7 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp
@@ -60,7 +60,7 @@ namespace Impl {
  * constructors that match the view.  The constructors and assignments
  * from view will externalize the logic needed to enable/disable
  * ref counting to provide a single gate to enable further developments
- * which may hing on the same logic.
+ * which may hinge on the same logic.
  *
  */
 template <class ParentView>
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp
index 2eb8fc9e3..13ed4df6a 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp
@@ -76,15 +76,13 @@ struct ViewUniformLayout<Kokkos::LayoutRight, 1> {
 
 template <class ViewType, int Traits>
 struct ViewUniformType {
-  using data_type = typename ViewType::data_type;
-  using const_data_type =
-      typename std::add_const<typename ViewType::data_type>::type;
+  using data_type       = typename ViewType::data_type;
+  using const_data_type = std::add_const_t<typename ViewType::data_type>;
   using runtime_data_type =
       typename ViewScalarToDataType<typename ViewType::value_type,
                                     ViewType::rank>::type;
   using runtime_const_data_type = typename ViewScalarToDataType<
-      typename std::add_const<typename ViewType::value_type>::type,
-      ViewType::rank>::type;
+      std::add_const_t<typename ViewType::value_type>, ViewType::rank>::type;
 
   using array_layout =
       typename ViewUniformLayout<typename ViewType::array_layout,
diff --git a/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp
index 04507b098..a0d1bc4cb 100644
--- a/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #define DEBUG_PRINT 0
 
 #include <iostream>
@@ -234,7 +238,6 @@ unsigned thread_mapping(const char* const label, const bool allow_async,
 
 #include <iostream>
 #include <sstream>
-#include <stdexcept>
 
 /*--------------------------------------------------------------------------*/
 /* Third Party Libraries */
@@ -272,9 +275,9 @@ enum { MAX_CORE = 1024 };
 
 std::pair<unsigned, unsigned> s_core_topology(0, 0);
 unsigned s_core_capacity(0);
-hwloc_topology_t s_hwloc_topology(0);
-hwloc_bitmap_t s_hwloc_location(0);
-hwloc_bitmap_t s_process_binding(0);
+hwloc_topology_t s_hwloc_topology(nullptr);
+hwloc_bitmap_t s_hwloc_location(nullptr);
+hwloc_bitmap_t s_process_binding(nullptr);
 hwloc_bitmap_t s_core[MAX_CORE];
 bool s_can_bind_threads(true);
 
@@ -286,13 +289,13 @@ struct Sentinel {
 bool sentinel() {
   static Sentinel self;
 
-  if (0 == s_hwloc_topology) {
+  if (nullptr == s_hwloc_topology) {
     std::cerr << "Kokkos::hwloc ERROR : Called after return from main()"
               << std::endl;
     std::cerr.flush();
   }
 
-  return 0 != s_hwloc_topology;
+  return nullptr != s_hwloc_topology;
 }
 
 Sentinel::~Sentinel() {
@@ -303,9 +306,9 @@ Sentinel::~Sentinel() {
   s_core_topology.first  = 0;
   s_core_topology.second = 0;
   s_core_capacity        = 0;
-  s_hwloc_topology       = 0;
-  s_hwloc_location       = 0;
-  s_process_binding      = 0;
+  s_hwloc_topology       = nullptr;
+  s_hwloc_location       = nullptr;
+  s_process_binding      = nullptr;
 }
 
 Sentinel::Sentinel() {
@@ -317,11 +320,11 @@ Sentinel::Sentinel() {
 
   s_core_topology   = std::pair<unsigned, unsigned>(0, 0);
   s_core_capacity   = 0;
-  s_hwloc_topology  = 0;
-  s_hwloc_location  = 0;
-  s_process_binding = 0;
+  s_hwloc_topology  = nullptr;
+  s_hwloc_location  = nullptr;
+  s_process_binding = nullptr;
 
-  for (unsigned i = 0; i < MAX_CORE; ++i) s_core[i] = 0;
+  for (unsigned i = 0; i < MAX_CORE; ++i) s_core[i] = nullptr;
 
   hwloc_topology_init(&s_hwloc_topology);
   hwloc_topology_load(s_hwloc_topology);
diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp
index 8551856aa..983a71a56 100644
--- a/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp
+++ b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp
@@ -125,8 +125,6 @@
 #else
 #define KOKKOS_DEFAULTED_FUNCTION inline
 #endif
-#define KOKKOS_IMPL_HOST_FUNCTION __host__
-#define KOKKOS_IMPL_DEVICE_FUNCTION __device__
 
 #if (CUDA_VERSION >= 10000)
 #define KOKKOS_CUDA_ENABLE_GRAPHS
diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
index 32236e963..b203e9afb 100644
--- a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
+++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
@@ -45,6 +45,15 @@
 #ifndef KOKKOS_SETUP_SYCL_HPP_
 #define KOKKOS_SETUP_SYCL_HPP_
 
+// FIXME_SYCL the fallback assert is temporarily disabled by default in the
+// compiler so we need to force it
+#ifndef SYCL_ENABLE_FALLBACK_ASSERT
+#define SYCL_ENABLE_FALLBACK_ASSERT
+#endif
+#ifndef SYCL_FALLBACK_ASSERT
+#define SYCL_FALLBACK_ASSERT 1
+#endif
+
 #include <CL/sycl.hpp>
 
 #ifdef __SYCL_DEVICE_ONLY__
diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
index 7bd96ab53..f306e43a0 100644
--- a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
@@ -109,9 +109,14 @@ struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> {
   //   we should benchmark this assumption if it becomes a problem.
   template <class T>
   using trait_matches_specification = std::integral_constant<
-      bool, !std::is_void<T>::value &&
-                !type_list_any<_trait_matches_spec_predicate<T>::template apply,
-                               _exec_policy_traits_without_work_tag>::value>;
+      bool,
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_3
+      std::is_empty<T>::value &&
+#else
+      !std::is_void<T>::value &&
+#endif
+          !type_list_any<_trait_matches_spec_predicate<T>::template apply,
+                         _exec_policy_traits_without_work_tag>::value>;
 };
 
 // </editor-fold> end trait specification }}}1
diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt
index 0d968b89f..24f70c0cc 100644
--- a/packages/kokkos/core/unit_test/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/CMakeLists.txt
@@ -58,6 +58,8 @@ SET(KOKKOS_SYCL_FEATURE_LEVEL 999)
 SET(KOKKOS_SYCL_NAME Experimental::SYCL)
 SET(KOKKOS_THREADS_FEATURE_LEVEL 999)
 SET(KOKKOS_THREADS_NAME Threads)
+SET(KOKKOS_OPENACC_FEATURE_LEVEL 3)
+SET(KOKKOS_OPENACC_NAME Experimental::OpenACC)
 
 
 #
@@ -70,8 +72,11 @@ KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_
 KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
 SET(COMPILE_ONLY_SOURCES
+  TestArray.cpp
   TestDetectionIdiom.cpp
   TestInterOp.cpp
+  TestLegionInteroperability.cpp
+  TestStringManipulation.cpp
   TestTypeList.cpp
 )
 # TestInterOp has a dependency on containers
@@ -116,10 +121,12 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         ExecutionSpace
         FunctorAnalysis
         Init
+        JoinBackwardCompatibility
         LocalDeepCopy
         MinMaxClamp
         MathematicalConstants
-        MathematicalFunctions
+        MathematicalFunctions1
+        MathematicalFunctions2
         MDRange_a
         MDRange_b
         MDRange_c
@@ -172,8 +179,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
       list(APPEND ${Tag}_SOURCES1B ${file})
     endforeach()
 
-    SET(${Tag}_SOURCES2A)
-    foreach(Name
+    SET(SOURCES2A_NAME_LIST
+      Abort
       TeamBasic
       TeamReductionScan
       TeamScan
@@ -188,14 +195,27 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
       ViewAPI_e
       ViewCopy_a
       ViewCopy_b
+      ViewCtorDimMatch
+      ViewHooks
       ViewLayoutStrideAssignment
       ViewMapping_b
       ViewMapping_subview
+      ViewMemoryAccessViolation
       ViewOfClass
       ViewResize
       View_64bit
       WorkGraph
+      WithoutInitializing
+    )
+    IF(KOKKOS_HAS_TRILINOS)
+      LIST(REMOVE_ITEM SOURCES2A_NAME_LIST
+        Abort
+        ViewMemoryAccessViolation
       )
+    ENDIF()
+
+    SET(${Tag}_SOURCES2A)
+    foreach(Name ${SOURCES2A_NAME_LIST})
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
       # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
@@ -211,7 +231,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
     if (Tag STREQUAL "Cuda")
       set(TagHostAccessible CudaUVM)
     elseif(Tag STREQUAL "HIP")
-      set(TagHostAccessible HIPHostPinned)
+      set(TagHostAccessible HIPManaged)
     elseif(Tag STREQUAL "SYCL")
       set(TagHostAccessible SYCLSharedUSM)
     endif()
@@ -262,7 +282,6 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
       SubView_c12
       SubView_c13
       SubView_c14
-      WithoutInitializing
       )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
@@ -281,7 +300,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   endif()
 endforeach()
 
-foreach(PairDeviceSpace HIP-HostPinned;Cuda-HostPinned;Cuda-UVM;SYCL-HostUSM;SYCL-SharedUSM)
+foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL-HostUSM;SYCL-SharedUSM)
   string(REGEX REPLACE "([^-]*)-(.*)" "\\1" DEVICE ${PairDeviceSpace})
   string(REGEX REPLACE "([^-]*)-(.*)" "\\2" SPACE ${PairDeviceSpace})
 
@@ -450,6 +469,7 @@ endif()
 if (Kokkos_ENABLE_OPENMP)
   set(OpenMP_EXTRA_SOURCES
     openmp/TestOpenMP_Task.cpp
+    openmp/TestOpenMP_PartitionMaster.cpp
   )
   if (Kokkos_ENABLE_DEPRECATED_CODE_3)
     list(APPEND OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp)
@@ -492,25 +512,10 @@ if(Kokkos_ENABLE_HPX)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_HPX_IndependentInstances
     SOURCES
-      UnitTestMain.cpp
+      UnitTestMainInit.cpp
       hpx/TestHPX_IndependentInstances.cpp
-  )
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HPX_IndependentInstancesDelayedExecution
-    SOURCES
-      UnitTestMain.cpp
       hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
-  )
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HPX_IndependentInstancesInstanceIds
-    SOURCES
-      UnitTestMain.cpp
       hpx/TestHPX_IndependentInstancesInstanceIds.cpp
-  )
-  KOKKOS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_HPX_IndependentInstancesRefCounting
-    SOURCES
-      UnitTestMain.cpp
       hpx/TestHPX_IndependentInstancesRefCounting.cpp
   )
 endif()
@@ -585,6 +590,8 @@ if(Kokkos_ENABLE_HIP)
       UnitTestMainInit.cpp
       ${HIP_SOURCES}
       hip/TestHIP_ScanUnit.cpp
+      hip/TestHIP_Spaces.cpp
+      hip/TestHIP_Memory_Requirements.cpp
       hip/TestHIP_TeamScratchStreams.cpp
       hip/TestHIP_AsyncLauncher.cpp
       hip/TestHIP_BlocksizeDeduction.cpp
@@ -690,11 +697,15 @@ endif()
 if (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   SET(DEFAULT_DEVICE_SOURCES
     UnitTestMainInit.cpp
+    TestInitializationSettings.cpp
+    TestParseCmdLineArgsAndEnvVars.cpp
     default/TestDefaultDeviceType.cpp
   )
 else()
   SET(DEFAULT_DEVICE_SOURCES
     UnitTestMainInit.cpp
+    TestInitializationSettings.cpp
+    TestParseCmdLineArgsAndEnvVars.cpp
     default/TestDefaultDeviceType.cpp
     default/TestDefaultDeviceType_a1.cpp
     default/TestDefaultDeviceType_b1.cpp
@@ -827,7 +838,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       NAME ProfilingTestLibraryCmdLineHelp
       EXE  ProfilingAllCalls
       ARGS --kokkos-tools-help
-           --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool>
+           --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool>
       PASS_REGULAR_EXPRESSION
         "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::")
 
@@ -853,7 +864,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       NAME ProfilingTestLibraryCmdLine
       EXE  ProfilingAllCalls
       ARGS [=[--kokkos-tools-args=-c test delimit]=]
-            --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool>
+            --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool>
       PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
   endif() #KOKKOS_ENABLE_LIBDL
@@ -878,12 +889,14 @@ KOKKOS_ADD_TEST( NAME UnitTest_StackTraceTest
                )
 endif()
 
-foreach(INITTESTS_NUM RANGE 1 18)
-KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_DefaultInit_${INITTESTS_NUM}
-  SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp
-)
-endforeach(INITTESTS_NUM)
+if(Kokkos_ENABLE_DEPRECATED_CODE_3)
+  foreach(INITTESTS_NUM RANGE 1 18)
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_DefaultInit_${INITTESTS_NUM}
+    SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp
+  )
+  endforeach(INITTESTS_NUM)
+endif()
 
 if (KOKKOS_ENABLE_HWLOC)
 KOKKOS_ADD_EXECUTABLE_AND_TEST(
diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile
index 570cee022..0c3e1ee47 100644
--- a/packages/kokkos/core/unit_test/Makefile
+++ b/packages/kokkos/core/unit_test/Makefile
@@ -289,6 +289,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
 	OBJ_HIP += TestHIP_Reductions.o
 	OBJ_HIP += TestHIP_MDRange_a.o TestHIP_MDRange_b.o TestHIP_MDRange_c.o TestHIP_MDRange_d.o TestHIP_MDRange_e.o
 	OBJ_HIP += TestHIP_Spaces.o
+	OBJ_HIP += TestHIP_Memory_Requirements.o
 	OBJ_HIP += TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o
 	OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o
 	OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o
@@ -408,12 +409,14 @@ TEST_TARGETS += test-stack-trace
 TEST_TARGETS += test-stack-trace-terminate
 TEST_TARGETS += test-stack-trace-generic-term
 
+ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
 NUM_INITTESTS = 16
 INITTESTS_NUMBERS := $(shell seq 1 ${NUM_INITTESTS})
 INITTESTS_TARGETS := $(addprefix KokkosCore_UnitTest_DefaultDeviceTypeInit_,${INITTESTS_NUMBERS})
 TARGETS += ${INITTESTS_TARGETS}
 INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
 TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
+endif
 
 KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Cuda
@@ -508,7 +511,7 @@ test-push-finalize-hook: KokkosCore_UnitTest_PushFinalizeHook
 
 test-push-finalize-hook-terminate: KokkosCore_UnitTest_PushFinalizeHook_terminate
 	./KokkosCore_UnitTest_PushFinalizeHook_terminate
-	
+
 test-stack-trace: KokkosCore_UnitTest_StackTraceTestExec
 	./KokkosCore_UnitTest_StackTraceTestExec --gtest_filter=*normal$(STACK_TRACE_TERMINATE_FILTER)
 
diff --git a/packages/kokkos/core/unit_test/TestAbort.hpp b/packages/kokkos/core/unit_test/TestAbort.hpp
new file mode 100644
index 000000000..58da329c3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestAbort.hpp
@@ -0,0 +1,138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <regex>
+#include <Kokkos_Core.hpp>
+
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
+TEST(TEST_CATEGORY_DEATH, abort_from_host) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+
+  char msg[] = "Goodbye cruel world";
+  EXPECT_DEATH({ Kokkos::abort(msg); }, msg);
+}
+
+template <class ExecutionSpace>
+struct TestAbortPrintingToStdout {
+  TestAbortPrintingToStdout() {
+    ::testing::internal::CaptureStdout();
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this);
+    Kokkos::fence();
+    auto const captured = ::testing::internal::GetCapturedStdout();
+    EXPECT_TRUE(std::regex_search(captured,
+                                  std::regex("move along nothing to see here")))
+        << "here is what was printed to stdout \"" << captured << "\"";
+  }
+  KOKKOS_FUNCTION void operator()(int) const {
+    Kokkos::abort("move along nothing to see here");
+  }
+};
+
+template <class ExecutionSpace>
+struct TestAbortCausingAbnormalProgramTerminationButIgnoringErrorMessage {
+  TestAbortCausingAbnormalProgramTerminationButIgnoringErrorMessage() {
+    EXPECT_DEATH(
+        {
+          Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, 1),
+                               *this);
+          Kokkos::fence();
+        },
+        ".*");
+  }
+  KOKKOS_FUNCTION void operator()(int) const { Kokkos::abort("ignored"); }
+};
+
+template <class ExecutionSpace>
+struct TestAbortCausingAbnormalProgramTerminationAndPrinting {
+  TestAbortCausingAbnormalProgramTerminationAndPrinting() {
+    EXPECT_DEATH(
+        {
+          Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, 1),
+                               *this);
+          Kokkos::fence();
+        },
+        "Meurs, pourriture communiste !");
+  }
+  KOKKOS_FUNCTION void operator()(int) const {
+    Kokkos::abort("Meurs, pourriture communiste !");
+  }
+};
+
+template <class ExecutionSpace>
+void test_abort_from_device() {
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)  // FIXME_OPENMPTARGET
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::OpenMPTarget>::value) {
+    TestAbortPrintingToStdout<ExecutionSpace>();
+  } else {
+    TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>();
+  }
+#elif defined(KOKKOS_ENABLE_SYCL)  // FIXME_SYCL
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value) {
+#ifdef NDEBUG
+    TestAbortPrintingToStdout<ExecutionSpace>();
+#else
+    TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>();
+#endif
+  } else {
+    TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>();
+  }
+#elif defined(KOKKOS_IMPL_HIP_ABORT_DOES_NOT_PRINT_MESSAGE)
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value) {
+    TestAbortCausingAbnormalProgramTerminationButIgnoringErrorMessage<
+        ExecutionSpace>();
+  } else {
+    TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>();
+  }
+#else
+  TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>();
+#endif
+}
+
+TEST(TEST_CATEGORY_DEATH, abort_from_device) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+  test_abort_from_device<TEST_EXECSPACE>();
+}
+#endif
diff --git a/packages/kokkos/core/unit_test/TestAggregate.hpp b/packages/kokkos/core/unit_test/TestAggregate.hpp
index 7590c6f1f..d21e6f46d 100644
--- a/packages/kokkos/core/unit_test/TestAggregate.hpp
+++ b/packages/kokkos/core/unit_test/TestAggregate.hpp
@@ -45,13 +45,7 @@
 #ifndef TEST_AGGREGATE_HPP
 #define TEST_AGGREGATE_HPP
 
-#include <gtest/gtest.h>
-
-#include <stdexcept>
-#include <sstream>
-#include <iostream>
-
-#include <impl/Kokkos_ViewArray.hpp>
+#include <Kokkos_Core.hpp>
 
 namespace Test {
 
@@ -78,8 +72,7 @@ void TestViewAggregate() {
   static_assert(a32_traits::rank == 2, "");
   static_assert(a32_traits::rank_dynamic == 2, "");
 
-  static_assert(std::is_same<typename flat_traits::specialize, void>::value,
-                "");
+  static_assert(std::is_void<typename flat_traits::specialize>::value, "");
   static_assert(flat_traits::rank == 3, "");
   static_assert(flat_traits::rank_dynamic == 2, "");
   static_assert(flat_traits::dimension::N2 == 32, "");
diff --git a/packages/kokkos/core/unit_test/TestArray.cpp b/packages/kokkos/core/unit_test/TestArray.cpp
new file mode 100644
index 000000000..ca93918e0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestArray.cpp
@@ -0,0 +1,84 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Array.hpp>
+
+namespace {
+
+#define STATIC_ASSERT(cond) static_assert(cond, "")
+
+KOKKOS_FUNCTION constexpr bool test_array() {
+  constexpr Kokkos::Array<int, 3> a{{1, 2}};
+
+  STATIC_ASSERT(!a.empty());
+  STATIC_ASSERT(a.size() == 3);
+  STATIC_ASSERT(a.max_size() == 3);
+
+  STATIC_ASSERT(*a.data() == 1);
+  STATIC_ASSERT(a[1] == 2);
+
+  return true;
+}
+
+STATIC_ASSERT(test_array());
+
+#ifdef KOKKOS_ENABLE_CXX17
+KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() {
+  constexpr Kokkos::Array<float, 2> a{};
+  auto& [xr, yr] = a;
+  (void)xr;
+  (void)yr;
+  auto [x, y] = a;
+  (void)x;
+  (void)y;
+  auto const& [xcr, ycr] = a;
+  (void)xcr;
+  (void)ycr;
+  return true;
+}
+
+STATIC_ASSERT(test_array_structured_binding_support());
+#endif
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
index 1ec175710..ab9b970be 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -705,7 +705,6 @@ bool DivAtomicTest(T i0, T i1) {
   bool passed = true;
 
   using Kokkos::abs;
-  using std::abs;
   if (abs((resSerial - res) * 1.) > 1e-5) {
     passed = false;
 
diff --git a/packages/kokkos/core/unit_test/TestAtomicViews.hpp b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
index 88f1aee63..916dc949b 100644
--- a/packages/kokkos/core/unit_test/TestAtomicViews.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
@@ -73,8 +73,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& update,
-                   const volatile value_type& input) {
+  static void join(value_type& update, const value_type& input) {
     update |= input;
   }
 
diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp
index f2993914a..0db2c735a 100644
--- a/packages/kokkos/core/unit_test/TestAtomics.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomics.hpp
@@ -405,9 +405,9 @@ T ExchLoop(int loop) {
 }
 
 template <class T>
-T ExchLoopSerial(
-    typename std::conditional<!std::is_same<T, Kokkos::complex<double> >::value,
-                              int, void>::type loop) {
+T ExchLoopSerial(std::conditional_t<
+                 !std::is_same<T, Kokkos::complex<double> >::value, int, void>
+                     loop) {
   T* data  = new T[1];
   T* data2 = new T[1];
   data[0]  = 0;
@@ -427,9 +427,9 @@ T ExchLoopSerial(
 }
 
 template <class T>
-T ExchLoopSerial(
-    typename std::conditional<std::is_same<T, Kokkos::complex<double> >::value,
-                              int, void>::type loop) {
+T ExchLoopSerial(std::conditional_t<
+                 std::is_same<T, Kokkos::complex<double> >::value, int, void>
+                     loop) {
   T* data  = new T[1];
   T* data2 = new T[1];
   data[0]  = 0;
diff --git a/packages/kokkos/core/unit_test/TestCXX11.hpp b/packages/kokkos/core/unit_test/TestCXX11.hpp
index bbe0d01cb..3dbce28ad 100644
--- a/packages/kokkos/core/unit_test/TestCXX11.hpp
+++ b/packages/kokkos/core/unit_test/TestCXX11.hpp
@@ -216,8 +216,7 @@ struct FunctorReduceTest {
   void init(value_type& update) const { update = 0.0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& update,
-            volatile value_type const& input) const {
+  void join(value_type& update, value_type const& input) const {
     update += input;
   }
 };
diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp
index 513fb6aee..cd4298f8e 100644
--- a/packages/kokkos/core/unit_test/TestComplex.hpp
+++ b/packages/kokkos/core/unit_test/TestComplex.hpp
@@ -48,11 +48,6 @@
 
 namespace Test {
 
-#ifdef KOKKOS_COMPILER_NVHPC
-// warning: 'long double' is treated as 'double' in device code
-#pragma diag_suppress 20208
-#endif
-
 // Test construction and assignment
 
 template <class ExecSpace>
@@ -369,6 +364,10 @@ struct TestComplexSpecialFunctions {
     r = {1.380543138238714, 0.2925178131625636};
     ASSERT_FLOAT_EQ(h_results(17).real(), r.real());
     ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
+    // log10
+    r = std::log10(a);
+    ASSERT_FLOAT_EQ(h_results(18).real(), r.real());
+    ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
 #endif
   }
 
@@ -396,6 +395,7 @@ struct TestComplexSpecialFunctions {
     d_results(15) = Kokkos::asin(a);
     d_results(16) = Kokkos::acos(a);
     d_results(17) = Kokkos::atan(a);
+    d_results(18) = Kokkos::log10(a);
   }
 };
 
diff --git a/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp b/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp
index 5a7b8e4ba..9feac14d4 100644
--- a/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp
+++ b/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp
@@ -47,7 +47,6 @@
 
 #include <gtest/gtest.h>
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
index 73db630b3..4500bd83b 100644
--- a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
+++ b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
@@ -225,12 +225,12 @@ struct TestDeepCopyScalarConversion {
   using view_type_s1_2d = Kokkos::View<Scalar1**, Layout1, TEST_EXECSPACE>;
   using view_type_s2_2d = Kokkos::View<Scalar2**, Layout2, TEST_EXECSPACE>;
 
-  using base_layout1 = typename std::conditional<
-      std::is_same<Layout1, Kokkos::LayoutStride>::value, Kokkos::LayoutLeft,
-      Layout1>::type;
-  using base_layout2 = typename std::conditional<
-      std::is_same<Layout2, Kokkos::LayoutStride>::value, Kokkos::LayoutLeft,
-      Layout2>::type;
+  using base_layout1 =
+      std::conditional_t<std::is_same<Layout1, Kokkos::LayoutStride>::value,
+                         Kokkos::LayoutLeft, Layout1>;
+  using base_layout2 =
+      std::conditional_t<std::is_same<Layout2, Kokkos::LayoutStride>::value,
+                         Kokkos::LayoutLeft, Layout2>;
 
   using base_type_s1_1d = Kokkos::View<Scalar1*, base_layout1, TEST_EXECSPACE>;
   using base_type_s2_1d = Kokkos::View<Scalar2*, base_layout2, TEST_EXECSPACE>;
diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 7ffa5aadd..d915b7e47 100644
--- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -90,7 +90,7 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
       nthreads = omp_get_max_threads();
     }
 #elif defined(KOKKOS_ENABLE_HPX)
-    const auto concurrency = std::thread::hardware_concurrency();
+    const int concurrency = std::thread::hardware_concurrency();
     if (concurrency < nthreads) {
       nthreads = concurrency;
     }
@@ -165,7 +165,7 @@ Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa,
       nthreads = omp_get_max_threads();
     }
 #elif defined(KOKKOS_ENABLE_HPX)
-    const auto concurrency = std::thread::hardware_concurrency();
+    const int concurrency = std::thread::hardware_concurrency();
     if (concurrency < nthreads) {
       nthreads = concurrency;
     }
diff --git a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
index f8f5275d3..82228476e 100644
--- a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
+++ b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
@@ -43,7 +43,6 @@
 */
 
 #include <cstdio>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestExecutionSpace.hpp b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp
index 8e4331e80..c9d2d275b 100644
--- a/packages/kokkos/core/unit_test/TestExecutionSpace.hpp
+++ b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp
@@ -42,39 +42,41 @@
 //@HEADER
 */
 
-#include <cstdio>
-
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
 
-namespace Test {
-
 namespace {
 
-struct StructCopy {
+template <class ExecutionSpace>
+struct CheckClassWithExecutionSpaceAsDataMemberIsCopyable {
   Kokkos::DefaultExecutionSpace device;
   Kokkos::DefaultHostExecutionSpace host;
+
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    auto copy = *this;
+    // not actually doing anything useful, mostly checking that
+    // ExecutionSpace::in_parallel() is callable
+    if (static_cast<int>(copy.device.in_parallel()) < 0) {
+      ++e;
+    }
+  }
+
+  CheckClassWithExecutionSpaceAsDataMemberIsCopyable() {
+    int errors;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this,
+                            errors);
+    EXPECT_EQ(errors, 0);
+  }
 };
 
-template <class ExecutionSpace>
-void check_struct_copy() {
-#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-  // FIXME_OPENMPTARGET nvlink error: Undefined reference to
-  // '_ZSt25__throw_bad_function_callv' in
-  // '/tmp/TestOpenMPTarget_ExecutionSpace-434d81.cubin'
+// FIXME_OPENMPTARGET nvlink error: Undefined reference to
+// '_ZSt25__throw_bad_function_callv' in
+// '/tmp/TestOpenMPTarget_ExecutionSpace-434d81.cubin'
 #ifndef KOKKOS_ENABLE_OPENMPTARGET
-  StructCopy data;
-  parallel_for(
-      Kokkos::RangePolicy<ExecutionSpace>(0, 1), KOKKOS_LAMBDA(int) {
-        StructCopy data2 = data;
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF("%i \n", data2.device.in_parallel());
-      });
-#endif
-#endif
+TEST(TEST_CATEGORY, execution_space_as_class_data_member) {
+  CheckClassWithExecutionSpaceAsDataMemberIsCopyable<TEST_EXECSPACE>();
 }
+#endif
 
 }  // namespace
-
-TEST(TEST_CATEGORY, copy_structure) { check_struct_copy<TEST_EXECSPACE>(); }
-}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
index d9e2486a4..5e0910b52 100644
--- a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
+++ b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
@@ -61,7 +61,7 @@ struct TestFunctorAnalysis_03 {
   void operator()(int, value_type&) const {}
 
   KOKKOS_INLINE_FUNCTION
-  void join(value_type volatile&, value_type const volatile&) const {}
+  void join(value_type&, value_type const&) const {}
 
   KOKKOS_INLINE_FUNCTION static void init(value_type&) {}
 };
@@ -75,11 +75,11 @@ void test_functor_analysis() {
                                     Kokkos::RangePolicy<ExecSpace>,
                                     decltype(c01)>;
 
-  using R01 = typename A01::template Reducer<typename ExecSpace::memory_space>;
+  using R01 = typename A01::Reducer;
 
-  static_assert(std::is_same<typename A01::value_type, void>::value, "");
-  static_assert(std::is_same<typename A01::pointer_type, void>::value, "");
-  static_assert(std::is_same<typename A01::reference_type, void>::value, "");
+  static_assert(std::is_void<typename A01::value_type>::value, "");
+  static_assert(std::is_void<typename A01::pointer_type>::value, "");
+  static_assert(std::is_void<typename A01::reference_type>::value, "");
   static_assert(std::is_same<typename R01::functor_type, decltype(c01)>::value,
                 "");
 
@@ -94,7 +94,7 @@ void test_functor_analysis() {
   using A02 = Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::REDUCE,
       Kokkos::RangePolicy<ExecSpace>, decltype(c02)>;
-  using R02 = typename A02::template Reducer<typename ExecSpace::memory_space>;
+  using R02 = typename A02::Reducer;
 
   static_assert(std::is_same<typename A02::value_type, double>::value, "");
   static_assert(std::is_same<typename A02::pointer_type, double*>::value, "");
@@ -114,7 +114,7 @@ void test_functor_analysis() {
   using A03 = Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::REDUCE,
       Kokkos::RangePolicy<ExecSpace>, TestFunctorAnalysis_03>;
-  using R03 = typename A03::template Reducer<typename ExecSpace::memory_space>;
+  using R03 = typename A03::Reducer;
 
   static_assert(std::is_same<typename A03::value_type,
                              TestFunctorAnalysis_03::value_type>::value,
diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
index 543ae506e..977a70465 100644
--- a/packages/kokkos/core/unit_test/TestHalfOperators.hpp
+++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
@@ -1003,7 +1003,6 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) {
   std::memcpy(c_arr, h_arr, n_bytes);
   for (i = 0; i < n_bytes; i++) ASSERT_EQ(c_arr[i], h_arr_ptr[i]);
 
-  std::memcpy(h_arr, c_arr, n_bytes);
   ASSERT_EQ(h_arr[0], h_arr0);
   ASSERT_EQ(h_arr[1], h_arr1);
 }
diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp
index 731e9fc36..29bc45592 100644
--- a/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp
+++ b/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp
@@ -142,7 +142,7 @@ TEST(TEST_CATEGORY, host_shared_ptr_get) {
     HostSharedPtr<T> p2;
     p2 = p1;  // copy assignment
     EXPECT_EQ(p1.get(), &i);
-    EXPECT_EQ(p1.get(), &i);
+    EXPECT_EQ(p2.get(), &i);
   }
   {
     T i;
diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
index 10180251b..9a4da5ddd 100644
--- a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
+++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
@@ -42,6 +42,7 @@
 //@HEADER
 */
 
+#include <impl/Kokkos_StringManipulation.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
 #include <Kokkos_Core.hpp>
 
@@ -55,14 +56,9 @@ class Data {
   char d[64];
 
  public:
-  // Because strncpy is not supported within device code
-  static KOKKOS_FUNCTION void my_strncpy(char* dst, const char* src,
-                                         size_t cnt) {
-    while (cnt-- > 0 && (*dst++ = *src++) != '\0')
-      ;
-    while (cnt-- > 0) *dst++ = '\0';
+  KOKKOS_FUNCTION void write(char const* s) {
+    Kokkos::Impl::strncpy(d, s, sizeof(d));
   }
-  KOKKOS_FUNCTION void write(char const* s) { my_strncpy(d, s, sizeof(d)); }
 };
 
 template <class SmartPtr>
@@ -281,10 +277,14 @@ TEST(TEST_CATEGORY, host_shared_ptr_tracking) {
         Kokkos::Experimental::SYCLSharedUSMSpace>();
 #endif
 #ifdef KOKKOS_ENABLE_HIP
-  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value) {
     host_shared_ptr_test_reference_counting<
         Kokkos::Experimental::HIPHostPinnedSpace,
         Kokkos::Experimental::HIPHostPinnedSpace>();
+    host_shared_ptr_test_reference_counting<
+        Kokkos::Experimental::HIPManagedSpace,
+        Kokkos::Experimental::HIPManagedSpace>();
+  }
 #endif
 }
 
diff --git a/packages/kokkos/core/unit_test/TestInit.hpp b/packages/kokkos/core/unit_test/TestInit.hpp
index f124c6202..20536b0d3 100644
--- a/packages/kokkos/core/unit_test/TestInit.hpp
+++ b/packages/kokkos/core/unit_test/TestInit.hpp
@@ -43,7 +43,6 @@
 */
 
 #include <cstdio>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestInitializationSettings.cpp b/packages/kokkos/core/unit_test/TestInitializationSettings.cpp
new file mode 100644
index 000000000..a5b11c5a3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestInitializationSettings.cpp
@@ -0,0 +1,124 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <impl/Kokkos_InitializationSettings.hpp>
+
+namespace {
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+void take_initialization_settings(Kokkos::InitializationSettings const&) {}
+
+TEST(defaultdevicetype,
+     init_arguments_implicit_conversion_to_initialization_settings) {
+  Kokkos::InitArguments arguments;
+  take_initialization_settings(arguments);  // check that conversion is implicit
+  arguments.device_id      = 1;
+  arguments.tune_internals = true;
+  Kokkos::InitializationSettings settings{arguments};
+  EXPECT_FALSE(settings.has_num_threads());
+  EXPECT_TRUE(settings.has_device_id());
+  EXPECT_EQ(settings.get_device_id(), 1);
+  EXPECT_FALSE(settings.has_num_devices());
+  EXPECT_FALSE(settings.has_skip_device());
+  EXPECT_FALSE(settings.has_disable_warnings());
+  EXPECT_TRUE(settings.has_tune_internals());
+  EXPECT_TRUE(settings.get_tune_internals());
+  EXPECT_FALSE(settings.has_tools_help());
+  EXPECT_FALSE(settings.has_tools_libs());
+  EXPECT_FALSE(settings.has_tools_args());
+}
+#endif
+
+TEST(defaultdevicetype, initialization_settings) {
+  auto const settings = Kokkos::InitializationSettings()
+                            .set_num_threads(255)
+                            .set_disable_warnings(false)
+                            .set_tools_libs("my_custom_tool.so");
+  EXPECT_TRUE(settings.has_num_threads());
+  EXPECT_EQ(settings.get_num_threads(), 255);
+  EXPECT_FALSE(settings.has_device_id());
+  EXPECT_FALSE(settings.has_num_devices());
+  EXPECT_FALSE(settings.has_skip_device());
+  EXPECT_TRUE(settings.has_disable_warnings());
+  EXPECT_FALSE(settings.get_disable_warnings());
+  EXPECT_FALSE(settings.has_tune_internals());
+  EXPECT_FALSE(settings.has_tools_help());
+  EXPECT_TRUE(settings.has_tools_libs());
+  EXPECT_EQ(settings.get_tools_libs(), "my_custom_tool.so");
+  EXPECT_FALSE(settings.has_tools_args());
+}
+
+#define STATIC_ASSERT(...) static_assert(__VA_ARGS__, "")  // FIXME C++17
+
+constexpr bool test_initialization_settings_getter() {
+#define CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(NAME, TYPE)           \
+  STATIC_ASSERT(std::is_same<                                                  \
+                decltype(std::declval<Kokkos::InitializationSettings const&>() \
+                             .has_##NAME()),                                   \
+                bool>::value);                                                 \
+  STATIC_ASSERT(std::is_same<                                                  \
+                decltype(std::declval<Kokkos::InitializationSettings const&>() \
+                             .get_##NAME()),                                   \
+                TYPE>::value);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_threads, int);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(device_id, int);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_devices, int);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(skip_device, int);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(disable_warnings, bool);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tune_internals, bool);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_help, bool);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_libs, std::string);
+  CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_args, std::string);
+#undef CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE
+  return true;
+}
+
+STATIC_ASSERT(test_initialization_settings_getter());
+
+STATIC_ASSERT(
+    std::is_default_constructible<Kokkos::InitializationSettings>::value);
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestInterOp.cpp b/packages/kokkos/core/unit_test/TestInterOp.cpp
index 7f08afada..50238a93a 100644
--- a/packages/kokkos/core/unit_test/TestInterOp.cpp
+++ b/packages/kokkos/core/unit_test/TestInterOp.cpp
@@ -50,9 +50,10 @@
 static_assert(
     std::is_same<
         Kokkos::Experimental::python_view_type_t<Kokkos::View<double*>>,
-        Kokkos::View<
-            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
-            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+        Kokkos::View<double*,
+                     typename Kokkos::DefaultExecutionSpace::array_layout,
+                     typename Kokkos::DefaultExecutionSpace::memory_space,
+                     Kokkos::Experimental::DefaultViewHooks>>::value,
     "Error! Unexpected python_view_type for: View");
 
 // DynRankView
@@ -69,9 +70,10 @@ static_assert(
     std::is_same<
         Kokkos::Experimental::python_view_type_t<
             Kokkos::View<double*, Kokkos::DefaultExecutionSpace>>,
-        Kokkos::View<
-            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
-            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+        Kokkos::View<double*,
+                     typename Kokkos::DefaultExecutionSpace::array_layout,
+                     typename Kokkos::DefaultExecutionSpace::memory_space,
+                     Kokkos::Experimental::DefaultViewHooks>>::value,
     "Error! Unexpected python_view_type for: View + Execution Space");
 
 // DynRankView + Execution Space
@@ -85,11 +87,12 @@ static_assert(
     "Error! Unexpected python_view_type for: DynRankView + Execution Space");
 
 // View + Memory space
-static_assert(std::is_same<Kokkos::Experimental::python_view_type_t<
-                               Kokkos::View<int64_t*, Kokkos::HostSpace>>,
-                           Kokkos::View<int64_t*, Kokkos::LayoutRight,
-                                        Kokkos::HostSpace>>::value,
-              "Error! Unexpected python_view_type for: View + Memory space");
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<
+                     Kokkos::View<int64_t*, Kokkos::HostSpace>>,
+                 Kokkos::View<int64_t*, Kokkos::LayoutRight, Kokkos::HostSpace,
+                              Kokkos::Experimental::DefaultViewHooks>>::value,
+    "Error! Unexpected python_view_type for: View + Memory space");
 
 // DynRankView + Memory space
 static_assert(
@@ -105,8 +108,8 @@ static_assert(
         Kokkos::Experimental::python_view_type_t<Kokkos::View<
             int**, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>,
         Kokkos::View<int**, Kokkos::LayoutLeft,
-                     typename Kokkos::DefaultExecutionSpace::memory_space>>::
-        value,
+                     typename Kokkos::DefaultExecutionSpace::memory_space,
+                     Kokkos::Experimental::DefaultViewHooks>>::value,
     "Error! Unexpected python_view_type for: View + Layout + Execution space");
 
 // DynRankView + Layout + Execution space
@@ -121,10 +124,10 @@ static_assert(
 
 // View + Layout + Memory Space
 static_assert(
-    std::is_same<
-        Kokkos::Experimental::python_view_type_t<
-            Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
-        Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>::value,
+    std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::View<
+                     uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
+                 Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace,
+                              Kokkos::Experimental::DefaultViewHooks>>::value,
     "Error! Unexpected python_view_type for: View + Layout + Memory Space");
 
 // DynRankView + Layout + Memory Space
@@ -144,6 +147,7 @@ static_assert(
             Kokkos::MemoryTraits<Kokkos::RandomAccess>>>,
         Kokkos::View<float***, Kokkos::LayoutLeft,
                      typename Kokkos::DefaultHostExecutionSpace::memory_space,
+                     Kokkos::Experimental::DefaultViewHooks,
                      Kokkos::MemoryTraits<Kokkos::RandomAccess>>>::value,
     "Error! Unexpected python_view_type for: View + Layout + Execution space + "
     "Memory Trait");
diff --git a/packages/kokkos/core/unit_test/TestIrregularLayout.hpp b/packages/kokkos/core/unit_test/TestIrregularLayout.hpp
index 86f9353e2..af4fd2245 100644
--- a/packages/kokkos/core/unit_test/TestIrregularLayout.hpp
+++ b/packages/kokkos/core/unit_test/TestIrregularLayout.hpp
@@ -44,7 +44,6 @@
 
 #include <gtest/gtest.h>
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #define OFFSET_LIST_MAX_SIZE 100
diff --git a/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp
new file mode 100644
index 000000000..ab5ccd003
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp
@@ -0,0 +1,154 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <gtest/gtest.h>
+
+namespace {
+
+enum MyErrorCode {
+  no_error                           = 0b000,
+  error_operator_plus_equal          = 0b001,
+  error_operator_plus_equal_volatile = 0b010,
+  error_join_volatile                = 0b100
+
+};
+
+KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs,
+                                                MyErrorCode rhs) {
+  return static_cast<MyErrorCode>(static_cast<int>(lhs) |
+                                  static_cast<int>(rhs));
+}
+
+static_assert((no_error | error_operator_plus_equal_volatile) ==
+                  error_operator_plus_equal_volatile,
+              "");
+static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, "");
+
+struct MyJoinBackCompatValueType {
+  MyErrorCode err = no_error;
+};
+
+KOKKOS_FUNCTION void operator+=(MyJoinBackCompatValueType &x,
+                                const MyJoinBackCompatValueType &y) {
+  x.err = x.err | y.err | error_operator_plus_equal;
+}
+
+KOKKOS_FUNCTION void operator+=(volatile MyJoinBackCompatValueType &x,
+                                const volatile MyJoinBackCompatValueType &y) {
+  x.err = x.err | y.err | error_operator_plus_equal_volatile;
+}
+
+struct ReducerWithJoinThatTakesNonVolatileQualifiedArgs {
+  using reducer    = ReducerWithJoinThatTakesNonVolatileQualifiedArgs;
+  using value_type = MyJoinBackCompatValueType;
+  KOKKOS_FUNCTION void join(MyJoinBackCompatValueType &x,
+                            MyJoinBackCompatValueType const &y) const {
+    x.err = x.err | y.err;
+  }
+  KOKKOS_FUNCTION void operator()(int, MyJoinBackCompatValueType &) const {}
+  KOKKOS_FUNCTION
+  ReducerWithJoinThatTakesNonVolatileQualifiedArgs() {}
+};
+
+struct ReducerWithJoinThatTakesBothVolatileAndNonVolatileQualifiedArgs {
+  using reducer =
+      ReducerWithJoinThatTakesBothVolatileAndNonVolatileQualifiedArgs;
+  using value_type = MyJoinBackCompatValueType;
+  KOKKOS_FUNCTION void join(MyJoinBackCompatValueType &x,
+                            MyJoinBackCompatValueType const &y) const {
+    x.err = x.err | y.err;
+  }
+  KOKKOS_FUNCTION void join(MyJoinBackCompatValueType volatile &x,
+                            MyJoinBackCompatValueType const volatile &y) const {
+    x.err = x.err | y.err | error_join_volatile;
+  }
+  KOKKOS_FUNCTION void operator()(int, MyJoinBackCompatValueType &) const {}
+  KOKKOS_FUNCTION
+  ReducerWithJoinThatTakesBothVolatileAndNonVolatileQualifiedArgs() {}
+};
+
+struct ReducerWithJoinThatTakesVolatileQualifiedArgs {
+  using reducer    = ReducerWithJoinThatTakesVolatileQualifiedArgs;
+  using value_type = MyJoinBackCompatValueType;
+  KOKKOS_FUNCTION void join(MyJoinBackCompatValueType volatile &x,
+                            MyJoinBackCompatValueType const volatile &y) const {
+    x.err = x.err | y.err;
+  }
+  KOKKOS_FUNCTION void operator()(int, MyJoinBackCompatValueType &) const {}
+  KOKKOS_FUNCTION ReducerWithJoinThatTakesVolatileQualifiedArgs() {}
+};
+
+void test_join_backward_compatibility() {
+  MyJoinBackCompatValueType result;
+  Kokkos::RangePolicy<> policy(0, 1);
+
+#if defined KOKKOS_ENABLE_DEPRECATED_CODE_3
+  Kokkos::parallel_reduce(
+      policy, ReducerWithJoinThatTakesVolatileQualifiedArgs{}, result);
+  ASSERT_EQ(result.err, no_error);
+#endif
+
+  Kokkos::parallel_reduce(
+      policy, ReducerWithJoinThatTakesBothVolatileAndNonVolatileQualifiedArgs{},
+      result);
+  ASSERT_EQ(result.err, no_error);
+  Kokkos::parallel_reduce(
+      policy, ReducerWithJoinThatTakesNonVolatileQualifiedArgs{}, result);
+  ASSERT_EQ(result.err, no_error);
+
+  // avoid warnings unused function 'operator+='
+  result += {};
+  ASSERT_EQ(result.err, error_operator_plus_equal);
+  static_cast<MyJoinBackCompatValueType volatile &>(result) +=
+      static_cast<MyJoinBackCompatValueType const volatile &>(result);
+  ASSERT_EQ(result.err,
+            error_operator_plus_equal | error_operator_plus_equal_volatile);
+}
+
+TEST(TEST_CATEGORY, join_backward_compatibility) {
+  test_join_backward_compatibility();
+}
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestLegionInteroperability.cpp b/packages/kokkos/core/unit_test/TestLegionInteroperability.cpp
new file mode 100644
index 000000000..183855845
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestLegionInteroperability.cpp
@@ -0,0 +1,159 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 1800)
+
+namespace {
+
+// error: expression must have a constant value
+//   std::enable_if_t<!has_deprecated_cuda_impl_initialize_v<T>>
+constexpr bool
+test_compiler_upgrade_needed_for_detection_idiom_and_variable_template() {
+  return true;
+}
+static_assert(
+    test_compiler_upgrade_needed_for_detection_idiom_and_variable_template(),
+    "Intel C++ compiler is awesome");
+
+}  // namespace
+
+#else
+
+// The purpose of this compile-only test is twofold:
+// 1. mimic Legion's use of Kokkos implementation details for initializing the
+//    exectution environment
+// 2. demonstrate how to leverage SFINAE to support Kokkos version through the
+//    ExecutionSpace::impl_initialize breaking change before release 3.7
+namespace {
+#define STATIC_ASSERT(...) static_assert(__VA_ARGS__, "")  // FIXME C++17
+
+#ifdef KOKKOS_ENABLE_CUDA
+template <class T>
+using deprecated_cuda_impl_initialize_t =
+    decltype(T::impl_initialize(typename T::SelectDevice(0), 1));
+
+template <class T>
+constexpr bool has_deprecated_cuda_impl_initialize_v =
+    Kokkos::is_detected<deprecated_cuda_impl_initialize_t, T>::value;
+
+template <class T>
+std::enable_if_t<has_deprecated_cuda_impl_initialize_v<T> >
+legion_initialize_kokkos_cuda() {
+  int cuda_device_id = 0;
+  int num_instances  = 1;
+  T::impl_initialize(typename T::SelectDevice(cuda_device_id), num_instances);
+}
+
+template <class T>
+std::enable_if_t<!has_deprecated_cuda_impl_initialize_v<T> >
+legion_initialize_kokkos_cuda() {
+  int cuda_device_id = 0;
+  auto const settings =
+      Kokkos::InitializationSettings().set_device_id(cuda_device_id);
+  T::impl_initialize(settings);
+}
+
+STATIC_ASSERT(std::is_void<
+              decltype(legion_initialize_kokkos_cuda<Kokkos::Cuda>())>::value);
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+template <class T>
+using deprecated_openmp_impl_initialize_t = decltype(T::impl_initialize(0));
+
+template <class T>
+constexpr bool has_deprecated_openmp_impl_initialize_v =
+    Kokkos::is_detected<deprecated_openmp_impl_initialize_t, T>::value;
+
+template <class T>
+std::enable_if_t<has_deprecated_openmp_impl_initialize_v<T> >
+legion_initialize_kokkos_openmp() {
+  int thread_count = -1;
+  T::impl_initialize(thread_count);
+}
+
+template <class T>
+std::enable_if_t<!has_deprecated_openmp_impl_initialize_v<T> >
+legion_initialize_kokkos_openmp() {
+  int thread_count = -1;
+  auto const settings =
+      Kokkos::InitializationSettings().set_num_threads(thread_count);
+  T::impl_initialize(settings);
+}
+
+STATIC_ASSERT(std::is_void<decltype(
+                  legion_initialize_kokkos_openmp<Kokkos::OpenMP>())>::value);
+
+#endif
+
+#ifdef KOKKOS_ENABLE_SERIAL
+template <class T>
+using deprecated_serial_impl_initialize_t = decltype(T::impl_initialize());
+
+template <class T>
+constexpr bool has_deprecated_serial_impl_initialize_v =
+    Kokkos::is_detected<deprecated_serial_impl_initialize_t, T>::value;
+
+template <class T>
+std::enable_if_t<has_deprecated_serial_impl_initialize_v<T> >
+legion_initialize_kokkos_serial() {
+  T::impl_initialize();
+}
+
+template <class T>
+std::enable_if_t<!has_deprecated_serial_impl_initialize_v<T> >
+legion_initialize_kokkos_serial() {
+  Kokkos::InitializationSettings settings;
+  T::impl_initialize(settings);
+}
+
+STATIC_ASSERT(std::is_void<decltype(
+                  legion_initialize_kokkos_serial<Kokkos::Serial>())>::value);
+#endif
+
+}  // namespace
+
+#endif
diff --git a/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp
index 80feb11f9..cdb14fec5 100644
--- a/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp
+++ b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp
@@ -44,7 +44,6 @@
 
 #include <gtest/gtest.h>
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <time.h>
diff --git a/packages/kokkos/core/unit_test/TestMDRange.hpp b/packages/kokkos/core/unit_test/TestMDRange.hpp
index 5ff87f8d9..e0a880a7a 100644
--- a/packages/kokkos/core/unit_test/TestMDRange.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange.hpp
@@ -70,21 +70,6 @@ struct TestMDRange_ReduceArray_2D {
                              const unsigned array_size)
       : input_view("input_view", N0, N1), value_count(array_size) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void init(scalar_type dst[]) const {
-    for (unsigned i = 0; i < value_count; ++i) {
-      dst[i] = 0.0;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile scalar_type dst[],
-            const volatile scalar_type src[]) const {
-    for (unsigned i = 0; i < value_count; ++i) {
-      dst[i] += src[i];
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, const int j) const { input_view(i, j) = 1; }
 
@@ -167,8 +152,7 @@ struct TestMDRange_ReduceArray_3D {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile scalar_type dst[],
-            const volatile scalar_type src[]) const {
+  void join(scalar_type dst[], const scalar_type src[]) const {
     for (unsigned i = 0; i < value_count; ++i) {
       dst[i] += src[i];
     }
@@ -385,7 +369,7 @@ struct TestMDRange_2D {
       parallel_reduce(
           "rank2-min-reducer", range,
           KOKKOS_LAMBDA(const int i, const int j, double &min_val) {
-            min_val = Kokkos::Experimental::fmin(v_in(i, j), min_val);
+            min_val = Kokkos::fmin(v_in(i, j), min_val);
           },
           reducer_scalar);
 
@@ -3902,14 +3886,6 @@ struct TestMDRange_ReduceScalar {
     void operator+=(const Scalar &src) {
       for (int i = 0; i < 4; i++) v[i] += src.v[i];
     }
-    KOKKOS_INLINE_FUNCTION
-    void operator=(const volatile Scalar &src) volatile {
-      for (int i = 0; i < 4; i++) v[i] = src.v[i];
-    }
-    KOKKOS_INLINE_FUNCTION
-    void operator+=(const volatile Scalar &src) volatile {
-      for (int i = 0; i < 4; i++) v[i] += src.v[i];
-    }
   };
 
   static void test_scalar_reduce(const int N0, const int N1) {
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
index 0e1514a33..7294f8e62 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
@@ -48,13 +48,12 @@
 #include <algorithm>
 #include <initializer_list>
 #include <type_traits>
-#include "Kokkos_ExecPolicy.hpp"
-#include "Kokkos_Parallel_Reduce.hpp"
 
 #include <cfloat>
 
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
-    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) ||          \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \
+    defined(KOKKOS_ENABLE_OPENACC)
 #else
 #define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
 #endif
@@ -239,14 +238,14 @@ struct FloatingPointComparison {
 
   // Using absolute here instead of abs, since we actually test abs ...
   template <class T>
-  KOKKOS_FUNCTION typename std::enable_if<std::is_signed<T>::value, T>::type
-  absolute(T val) const {
+  KOKKOS_FUNCTION std::enable_if_t<std::is_signed<T>::value, T> absolute(
+      T val) const {
     return val < T(0) ? -val : val;
   }
 
   template <class T>
-  KOKKOS_FUNCTION typename std::enable_if<!std::is_signed<T>::value, T>::type
-  absolute(T val) const {
+  KOKKOS_FUNCTION std::enable_if_t<!std::is_signed<T>::value, T> absolute(
+      T val) const {
     return val;
   }
 
@@ -257,10 +256,9 @@ struct FloatingPointComparison {
 
     bool ar = absolute(fpv) < abs_tol;
     if (!ar) {
-#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
-      printf("absolute value exceeds tolerance [|%e| > %e]\n", (double)fpv,
-             abs_tol);
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "absolute value exceeds tolerance [|%e| > %e]\n", (double)fpv,
+          abs_tol);
     }
 
     return ar;
@@ -279,12 +277,11 @@ struct FloatingPointComparison {
       double min_denom = static_cast<double>(
           absolute(rhs) < absolute(lhs) ? absolute(rhs) : absolute(lhs));
       double rel_diff = abs_diff / min_denom;
-      bool ar         = rel_diff < rel_tol;
+      bool ar         = abs_diff == 0 || rel_diff < rel_tol;
       if (!ar) {
-#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
-        printf("relative difference exceeds tolerance [%e > %e]\n",
-               (double)rel_diff, rel_tol);
-#endif
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "relative difference exceeds tolerance [%e > %e]\n",
+            (double)rel_diff, rel_tol);
       }
 
       return ar;
@@ -299,10 +296,10 @@ struct math_function_name;
   struct MathUnaryFunction_##FUNC {                                            \
     template <typename T>                                                      \
     static KOKKOS_FUNCTION auto eval(T x) {                                    \
-      static_assert(std::is_same<decltype(Kokkos::Experimental::FUNC((T)0)),   \
+      static_assert(std::is_same<decltype(Kokkos::FUNC((T)0)),                 \
                                  math_unary_function_return_type_t<T>>::value, \
                     "");                                                       \
-      return Kokkos::Experimental::FUNC(x);                                    \
+      return Kokkos::FUNC(x);                                                  \
     }                                                                          \
     template <typename T>                                                      \
     static auto eval_std(T x) {                                                \
@@ -320,11 +317,17 @@ struct math_function_name;
   };                                                                           \
   constexpr char math_function_name<MathUnaryFunction_##FUNC>::name[]
 
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
 // Generally the expected ULP error should come from here:
 // https://www.gnu.org/software/libc/manual/html_node/Errors-in-Math-Functions.html
 // For now 1s largely seem to work ...
 DEFINE_UNARY_FUNCTION_EVAL(exp, 2);
+#ifdef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC exp2 not device callable,
+                              // workaround computes it via exp
+DEFINE_UNARY_FUNCTION_EVAL(exp2, 30);
+#else
 DEFINE_UNARY_FUNCTION_EVAL(exp2, 2);
+#endif
 DEFINE_UNARY_FUNCTION_EVAL(expm1, 2);
 DEFINE_UNARY_FUNCTION_EVAL(log, 2);
 DEFINE_UNARY_FUNCTION_EVAL(log10, 2);
@@ -347,7 +350,9 @@ DEFINE_UNARY_FUNCTION_EVAL(tanh, 2);
 DEFINE_UNARY_FUNCTION_EVAL(asinh, 4);
 DEFINE_UNARY_FUNCTION_EVAL(acosh, 2);
 DEFINE_UNARY_FUNCTION_EVAL(atanh, 2);
+#endif
 
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
 #if defined(__APPLE__)
 // Apple's standard library implementation seems to have a poor implementation
 DEFINE_UNARY_FUNCTION_EVAL(erf, 5);
@@ -365,10 +370,14 @@ DEFINE_UNARY_FUNCTION_EVAL(lgamma, 2);
 DEFINE_UNARY_FUNCTION_EVAL(ceil, 2);
 DEFINE_UNARY_FUNCTION_EVAL(floor, 2);
 DEFINE_UNARY_FUNCTION_EVAL(trunc, 2);
+DEFINE_UNARY_FUNCTION_EVAL(round, 1);
 #ifndef KOKKOS_ENABLE_SYCL
 DEFINE_UNARY_FUNCTION_EVAL(nearbyint, 2);
 #endif
 
+DEFINE_UNARY_FUNCTION_EVAL(logb, 2);
+#endif
+
 #undef DEFINE_UNARY_FUNCTION_EVAL
 
 #define DEFINE_BINARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                    \
@@ -376,10 +385,10 @@ DEFINE_UNARY_FUNCTION_EVAL(nearbyint, 2);
     template <typename T, typename U>                                    \
     static KOKKOS_FUNCTION auto eval(T x, U y) {                         \
       static_assert(                                                     \
-          std::is_same<decltype(Kokkos::Experimental::FUNC((T)0, (U)0)), \
+          std::is_same<decltype(Kokkos::FUNC((T)0, (U)0)),               \
                        math_binary_function_return_type_t<T, U>>::value, \
           "");                                                           \
-      return Kokkos::Experimental::FUNC(x, y);                           \
+      return Kokkos::FUNC(x, y);                                         \
     }                                                                    \
     template <typename T, typename U>                                    \
     static auto eval_std(T x, U y) {                                     \
@@ -398,8 +407,14 @@ DEFINE_UNARY_FUNCTION_EVAL(nearbyint, 2);
   };                                                                     \
   constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]
 
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
 DEFINE_BINARY_FUNCTION_EVAL(pow, 2);
 DEFINE_BINARY_FUNCTION_EVAL(hypot, 2);
+#endif
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
+DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1);
+DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
+#endif
 
 #undef DEFINE_BINARY_FUNCTION_EVAL
 
@@ -443,10 +458,9 @@ struct TestMathUnaryFunction : FloatingPointComparison {
     bool ar = compare(Func::eval(val_[i]), res_[i], Func::ulp_factor());
     if (!ar) {
       ++e;
-#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
-      printf("value at %f which is %f was expected to be %f\n", (double)val_[i],
-             (double)Func::eval(val_[i]), (double)res_[i]);
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "value at %f which is %f was expected to be %f\n", (double)val_[i],
+          (double)Func::eval(val_[i]), (double)res_[i]);
     }
   }
 };
@@ -482,11 +496,9 @@ struct TestMathBinaryFunction : FloatingPointComparison {
     bool ar = compare(Func::eval(val1_, val2_), res_, Func::ulp_factor());
     if (!ar) {
       ++e;
-#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
-      printf("value at %f, %f which is %f was expected to be %f\n",
-             (double)val1_, (double)val2_, (double)Func::eval(val1_, val2_),
-             (double)res_);
-#endif
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "value at %f, %f which is %f was expected to be %f\n", (double)val1_,
+          (double)val2_, (double)Func::eval(val1_, val2_), (double)res_);
     }
   }
 };
@@ -497,6 +509,8 @@ void do_test_math_binary_function(Arg1 arg1, Arg2 arg2) {
       (TestMathBinaryFunction<Space, Func, Arg1, Arg2>(arg1, arg2), 0)...};
 }
 
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
+
 TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
   TEST_MATH_FUNCTION(sin)({true, false});
   TEST_MATH_FUNCTION(sin)({-3, -2, -1, 0, 1});
@@ -785,6 +799,9 @@ TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
   TEST_MATH_FUNCTION(atanh)({-.97l, .86l, -.53l, .42l, -.1l, 0.l});
 #endif
 }
+#endif
+
+#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
 
 TEST(TEST_CATEGORY, mathematical_functions_error_and_gamma_functions) {
   TEST_MATH_FUNCTION(erf)({-3, -2, -1, 0, 1});
@@ -874,6 +891,18 @@ TEST(TEST_CATEGORY,
   TEST_MATH_FUNCTION(trunc)({12.3l, 4.56l, 789.l});
 #endif
 
+  TEST_MATH_FUNCTION(round)({-3, -2, -1, 0, 1});
+  TEST_MATH_FUNCTION(round)({-3l, -2l, -1l, 0l, 1l});
+  TEST_MATH_FUNCTION(round)({-3ll, -2ll, -1ll, 0ll, 1ll});
+  TEST_MATH_FUNCTION(round)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(round)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(round)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(round)({2.3f, 2.5f, 2.7f, -2.3f, -2.5f, -2.7f, -0.0f});
+  TEST_MATH_FUNCTION(round)({2.3, 2.5, 2.7, -2.3, -2.5, -2.7, -0.0});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(round)({2.3l, 2.5l, 2.7l, -2.3l, -2.5l, -2.7l, -0.0l});
+#endif
+
 #ifndef KOKKOS_ENABLE_SYCL
   TEST_MATH_FUNCTION(nearbyint)({-3, -2, -1, 0, 1});
   TEST_MATH_FUNCTION(nearbyint)({-3l, -2l, -1l, 0l, 1l});
@@ -889,6 +918,43 @@ TEST(TEST_CATEGORY,
 #endif
 }
 
+TEST(TEST_CATEGORY,
+     mathematical_functions_floating_point_manipulation_functions) {
+  TEST_MATH_FUNCTION(logb)({2, 3, 4, 56, 789});
+  TEST_MATH_FUNCTION(logb)({2l, 3l, 4l, 56l, 789l});
+  TEST_MATH_FUNCTION(logb)({2ll, 3ll, 4ll, 56ll, 789ll});
+  TEST_MATH_FUNCTION(logb)({2u, 3u, 4u, 5u, 6u});
+  TEST_MATH_FUNCTION(logb)({2ul, 3ul, 4ul, 5ul, 6ul});
+  TEST_MATH_FUNCTION(logb)({2ull, 3ull, 4ull, 5ull, 6ull});
+  TEST_MATH_FUNCTION(logb)({123.45f, 6789.0f});
+  TEST_MATH_FUNCTION(logb)({123.45, 6789.0});
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  TEST_MATH_FUNCTION(logb)({123.45l, 6789.0l});
+#endif
+
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(0, 1.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(1, 2.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(0.1, 0);
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(1, 2.l);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_nextafter>(1.l, 2.l);
+#endif
+
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(0, 1.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1, 2.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(0.1, 0);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1.f, +2.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1.f, -2.f);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1., +2.);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1., -2.);
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1, +2.l);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1.l, +2);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1.l, +2.l);
+  do_test_math_binary_function<TEST_EXECSPACE, kk_copysign>(1.l, -2.l);
+#endif
+}
+
 template <class Space>
 struct TestAbsoluteValueFunction {
   TestAbsoluteValueFunction() { run(); }
@@ -898,7 +964,7 @@ struct TestAbsoluteValueFunction {
     ASSERT_EQ(errors, 0);
   }
   KOKKOS_FUNCTION void operator()(int, int& e) const {
-    using Kokkos::Experimental::abs;
+    using Kokkos::abs;
     if (abs(1) != 1 || abs(-1) != 1) {
       ++e;
       KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(int)\n");
@@ -926,8 +992,8 @@ struct TestAbsoluteValueFunction {
     }
 #endif
     // special values
-    using Kokkos::Experimental::isinf;
-    using Kokkos::Experimental::isnan;
+    using Kokkos::isinf;
+    using Kokkos::isnan;
     if (abs(-0.) != 0.
 #ifndef KOKKOS_IMPL_WORKAROUND_INTEL_LLVM_DEFAULT_FLOATING_POINT_MODEL
         || !isinf(abs(-INFINITY)) || !isnan(abs(-NAN))
@@ -962,7 +1028,7 @@ struct TestIsNaN {
     ASSERT_EQ(errors, 0);
   }
   KOKKOS_FUNCTION void operator()(int, int& e) const {
-    using Kokkos::Experimental::isnan;
+    using Kokkos::isnan;
     using Kokkos::Experimental::quiet_NaN;
     using Kokkos::Experimental::signaling_NaN;
     if (isnan(1) || isnan(INT_MAX)) {
@@ -1022,3 +1088,4 @@ struct TestIsNaN {
 TEST(TEST_CATEGORY, mathematical_functions_isnan) {
   TestIsNaN<TEST_EXECSPACE>();
 }
+#endif
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp
new file mode 100644
index 000000000..0fd56a5ac
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions1.hpp
@@ -0,0 +1,47 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
+#include "TestMathematicalFunctions.hpp"
+#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp
new file mode 100644
index 000000000..74e7443b1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions2.hpp
@@ -0,0 +1,47 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
+#include "TestMathematicalFunctions.hpp"
+#undef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
diff --git a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
index 45d8bd08a..26f237a71 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
@@ -17,7 +17,7 @@ struct TestExponentialIntergral1Function {
   HostViewType h_ref;
 
   void testit() {
-    using Kokkos::Experimental::fabs;
+    using Kokkos::fabs;
     using Kokkos::Experimental::infinity;
 
     d_x      = ViewType("d_x", 15);
@@ -1641,27 +1641,17 @@ struct TestComplexBesselH1Function {
         Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458039000e-02);
 
     EXPECT_EQ(h_ref_ch10(0), h_ch10(0));
-    std::cout << "h_ch10(0): " << h_ch10(0)
-              << ", h_ref_ch10(0): " << h_ref_ch10(0) << std::endl;
     for (int i = 1; i < N; i++) {
       EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)),
-                Kokkos::abs(h_ref_ch10(i)) * 1e-13);
-      std::cout << i
-                << ", actual diff: " << Kokkos::abs(h_ch10(i) - h_ref_ch10(i))
-                << ", expected diff: " << Kokkos::abs(h_ref_ch10(i)) * 1e-13
-                << std::endl;
+                Kokkos::abs(h_ref_ch10(i)) * 1e-13)
+          << "at index " << i;
     }
 
     EXPECT_EQ(h_ref_ch11(0), h_ch11(0));
-    std::cout << "h_ch11(0): " << h_ch11(0)
-              << ", h_ref_ch11(0): " << h_ref_ch11(0) << std::endl;
     for (int i = 1; i < N; i++) {
       EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)),
-                Kokkos::abs(h_ref_ch11(i)) * 1e-13);
-      std::cout << i
-                << ", actual diff: " << Kokkos::abs(h_ch11(i) - h_ref_ch11(i))
-                << ", expected diff: " << Kokkos::abs(h_ref_ch11(i)) * 1e-13
-                << std::endl;
+                Kokkos::abs(h_ref_ch11(i)) * 1e-13)
+          << "at index " << i;
     }
   }
 
diff --git a/packages/kokkos/core/unit_test/TestMemoryPool.hpp b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
index 829e8d641..75deae13a 100644
--- a/packages/kokkos/core/unit_test/TestMemoryPool.hpp
+++ b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -45,12 +45,7 @@
 #ifndef KOKKOS_UNITTEST_MEMPOOL_HPP
 #define KOKKOS_UNITTEST_MEMPOOL_HPP
 
-#include <cstdio>
-#include <iostream>
-#include <cmath>
-#include <algorithm>
-
-#include <Kokkos_Timer.hpp>
+#include <Kokkos_Core.hpp>
 
 namespace TestMemoryPool {
 
@@ -489,8 +484,8 @@ struct TestMemoryPoolHuge {
 template <class DeviceType>
 struct TestMemoryPoolHuge<
     DeviceType,
-    typename std::enable_if<std::is_same<
-        Kokkos::HostSpace, typename DeviceType::memory_space>::value>::type> {
+    std::enable_if_t<std::is_same<Kokkos::HostSpace,
+                                  typename DeviceType::memory_space>::value>> {
   using ptrs_type    = Kokkos::View<uintptr_t*, DeviceType>;
   using pool_type    = Kokkos::MemoryPool<DeviceType>;
   using memory_space = typename DeviceType::memory_space;
diff --git a/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp b/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp
index abf24ef97..42f83f348 100644
--- a/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp
+++ b/packages/kokkos/core/unit_test/TestMinMaxClamp.hpp
@@ -72,32 +72,30 @@ struct PairIntCompareFirst {
 // test max()
 // ----------------------------------------------------------
 TEST(TEST_CATEGORY, max) {
-  namespace KE = Kokkos::Experimental;
-
   int a = 1;
   int b = 2;
-  EXPECT_TRUE(KE::max(a, b) == 2);
+  EXPECT_EQ(Kokkos::max(a, b), 2);
 
   a = 3;
   b = 1;
-  EXPECT_TRUE(KE::max(a, b) == 3);
+  EXPECT_EQ(Kokkos::max(a, b), 3);
 
-  STATIC_ASSERT(KE::max(1, 2) == 2);
-  STATIC_ASSERT(KE::max(1, 2, ::Test::Greater<int>{}) == 1);
+  STATIC_ASSERT(Kokkos::max(1, 2) == 2);
+  STATIC_ASSERT(Kokkos::max(1, 2, ::Test::Greater<int>{}) == 1);
 
-  EXPECT_TRUE(KE::max({3.f, -1.f, 0.f}) == 3.f);
+  EXPECT_EQ(Kokkos::max({3.f, -1.f, 0.f}), 3.f);
 
-  STATIC_ASSERT(KE::max({3, -1, 0}) == 3);
-  STATIC_ASSERT(KE::max({3, -1, 0}, ::Test::Greater<int>{}) == -1);
+  STATIC_ASSERT(Kokkos::max({3, -1, 0}) == 3);
+  STATIC_ASSERT(Kokkos::max({3, -1, 0}, ::Test::Greater<int>{}) == -1);
 
-  STATIC_ASSERT(KE::max({
-                            ::Test::PairIntCompareFirst{255, 0},
-                            ::Test::PairIntCompareFirst{255, 1},
-                            ::Test::PairIntCompareFirst{0, 2},
-                            ::Test::PairIntCompareFirst{0, 3},
-                            ::Test::PairIntCompareFirst{255, 4},
-                            ::Test::PairIntCompareFirst{0, 5},
-                        })
+  STATIC_ASSERT(Kokkos::max({
+                                ::Test::PairIntCompareFirst{255, 0},
+                                ::Test::PairIntCompareFirst{255, 1},
+                                ::Test::PairIntCompareFirst{0, 2},
+                                ::Test::PairIntCompareFirst{0, 3},
+                                ::Test::PairIntCompareFirst{255, 4},
+                                ::Test::PairIntCompareFirst{0, 5},
+                            })
                     .second == 0);  // leftmost element
 }
 
@@ -107,9 +105,8 @@ struct StdAlgoMinMaxOpsTestMax {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int& ind) const {
-    namespace KE = Kokkos::Experimental;
-    auto v1      = 10.;
-    if (KE::max(v1, m_view(ind)) == 10.) {
+    auto v1 = 10.;
+    if (Kokkos::max(v1, m_view(ind)) == 10.) {
       m_view(ind) = 6.;
     }
   }
@@ -136,32 +133,30 @@ TEST(TEST_CATEGORY, max_within_parfor) {
 // test min()
 // ----------------------------------------------------------
 TEST(TEST_CATEGORY, min) {
-  namespace KE = Kokkos::Experimental;
-
   int a = 1;
   int b = 2;
-  EXPECT_TRUE(KE::min(a, b) == 1);
+  EXPECT_EQ(Kokkos::min(a, b), 1);
 
   a = 3;
   b = 2;
-  EXPECT_TRUE(KE::min(a, b) == 2);
+  EXPECT_EQ(Kokkos::min(a, b), 2);
 
-  STATIC_ASSERT(KE::min(3.f, 2.f) == 2.f);
-  STATIC_ASSERT(KE::min(3.f, 2.f, ::Test::Greater<int>{}) == 3.f);
+  STATIC_ASSERT(Kokkos::min(3.f, 2.f) == 2.f);
+  STATIC_ASSERT(Kokkos::min(3.f, 2.f, ::Test::Greater<int>{}) == 3.f);
 
-  EXPECT_TRUE(KE::min({3.f, -1.f, 0.f}) == -1.f);
+  EXPECT_EQ(Kokkos::min({3.f, -1.f, 0.f}), -1.f);
 
-  STATIC_ASSERT(KE::min({3, -1, 0}) == -1);
-  STATIC_ASSERT(KE::min({3, -1, 0}, ::Test::Greater<int>{}) == 3);
+  STATIC_ASSERT(Kokkos::min({3, -1, 0}) == -1);
+  STATIC_ASSERT(Kokkos::min({3, -1, 0}, ::Test::Greater<int>{}) == 3);
 
-  STATIC_ASSERT(KE::min({
-                            ::Test::PairIntCompareFirst{255, 0},
-                            ::Test::PairIntCompareFirst{255, 1},
-                            ::Test::PairIntCompareFirst{0, 2},
-                            ::Test::PairIntCompareFirst{0, 3},
-                            ::Test::PairIntCompareFirst{255, 4},
-                            ::Test::PairIntCompareFirst{0, 5},
-                        })
+  STATIC_ASSERT(Kokkos::min({
+                                ::Test::PairIntCompareFirst{255, 0},
+                                ::Test::PairIntCompareFirst{255, 1},
+                                ::Test::PairIntCompareFirst{0, 2},
+                                ::Test::PairIntCompareFirst{0, 3},
+                                ::Test::PairIntCompareFirst{255, 4},
+                                ::Test::PairIntCompareFirst{0, 5},
+                            })
                     .second == 2);  // leftmost element
 }
 
@@ -171,9 +166,8 @@ struct StdAlgoMinMaxOpsTestMin {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int& ind) const {
-    namespace KE = Kokkos::Experimental;
-    auto v1      = 10.;
-    if (KE::min(v1, m_view(ind)) == 0.) {
+    auto v1 = 10.;
+    if (Kokkos::min(v1, m_view(ind)) == 0.) {
       m_view(ind) = 8.;
     }
   }
@@ -199,49 +193,53 @@ TEST(TEST_CATEGORY, min_within_parfor) {
 // test minmax()
 // ----------------------------------------------------------
 TEST(TEST_CATEGORY, minmax) {
-  namespace KE  = Kokkos::Experimental;
   int a         = 1;
   int b         = 2;
-  const auto& r = KE::minmax(a, b);
-  EXPECT_TRUE(r.first == 1);
-  EXPECT_TRUE(r.second == 2);
+  const auto& r = Kokkos::minmax(a, b);
+  EXPECT_EQ(r.first, 1);
+  EXPECT_EQ(r.second, 2);
 
   a              = 3;
   b              = 2;
-  const auto& r2 = KE::minmax(a, b);
-  EXPECT_TRUE(r2.first == 2);
-  EXPECT_TRUE(r2.second == 3);
-
-  STATIC_ASSERT((Kokkos::pair<float, float>(KE::minmax(3.f, 2.f)) ==
+  const auto& r2 = Kokkos::minmax(a, b);
+  EXPECT_EQ(r2.first, 2);
+  EXPECT_EQ(r2.second, 3);
+
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC nvhpc can't deal with device side
+                               // constexpr constructors so I removed the
+                               // constexpr in pair, which makes STATIC_ASSERT
+                               // here fail
+  STATIC_ASSERT((Kokkos::pair<float, float>(Kokkos::minmax(3.f, 2.f)) ==
                  Kokkos::make_pair(2.f, 3.f)));
   STATIC_ASSERT(
-      (Kokkos::pair<float, float>(KE::minmax(
+      (Kokkos::pair<float, float>(Kokkos::minmax(
            3.f, 2.f, ::Test::Greater<int>{})) == Kokkos::make_pair(3.f, 2.f)));
 
-  EXPECT_TRUE(KE::minmax({3.f, -1.f, 0.f}) == Kokkos::make_pair(-1.f, 3.f));
+  EXPECT_EQ(Kokkos::minmax({3.f, -1.f, 0.f}), Kokkos::make_pair(-1.f, 3.f));
 
-  STATIC_ASSERT(KE::minmax({3, -1, 0}) == Kokkos::make_pair(-1, 3));
-  STATIC_ASSERT(KE::minmax({3, -1, 0}, ::Test::Greater<int>{}) ==
+  STATIC_ASSERT(Kokkos::minmax({3, -1, 0}) == Kokkos::make_pair(-1, 3));
+  STATIC_ASSERT(Kokkos::minmax({3, -1, 0}, ::Test::Greater<int>{}) ==
                 Kokkos::make_pair(3, -1));
 
-  STATIC_ASSERT(KE::minmax({
-                               ::Test::PairIntCompareFirst{255, 0},
-                               ::Test::PairIntCompareFirst{255, 1},
-                               ::Test::PairIntCompareFirst{0, 2},
-                               ::Test::PairIntCompareFirst{0, 3},
-                               ::Test::PairIntCompareFirst{255, 4},
-                               ::Test::PairIntCompareFirst{0, 5},
-                           })
+  STATIC_ASSERT(Kokkos::minmax({
+                                   ::Test::PairIntCompareFirst{255, 0},
+                                   ::Test::PairIntCompareFirst{255, 1},
+                                   ::Test::PairIntCompareFirst{0, 2},
+                                   ::Test::PairIntCompareFirst{0, 3},
+                                   ::Test::PairIntCompareFirst{255, 4},
+                                   ::Test::PairIntCompareFirst{0, 5},
+                               })
                     .first.second == 2);  // leftmost
-  STATIC_ASSERT(KE::minmax({
-                               ::Test::PairIntCompareFirst{255, 0},
-                               ::Test::PairIntCompareFirst{255, 1},
-                               ::Test::PairIntCompareFirst{0, 2},
-                               ::Test::PairIntCompareFirst{0, 3},
-                               ::Test::PairIntCompareFirst{255, 4},
-                               ::Test::PairIntCompareFirst{0, 5},
-                           })
+  STATIC_ASSERT(Kokkos::minmax({
+                                   ::Test::PairIntCompareFirst{255, 0},
+                                   ::Test::PairIntCompareFirst{255, 1},
+                                   ::Test::PairIntCompareFirst{0, 2},
+                                   ::Test::PairIntCompareFirst{0, 3},
+                                   ::Test::PairIntCompareFirst{255, 4},
+                                   ::Test::PairIntCompareFirst{0, 5},
+                               })
                     .second.second == 4);  // rightmost
+#endif
 }
 
 template <class ViewType>
@@ -250,9 +248,8 @@ struct StdAlgoMinMaxOpsTestMinMax {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int& ind) const {
-    namespace KE  = Kokkos::Experimental;
     auto v1       = 7.;
-    const auto& r = KE::minmax(v1, m_view(ind));
+    const auto& r = Kokkos::minmax(v1, m_view(ind));
     m_view(ind)   = (double)(r.first - r.second);
   }
 
@@ -261,7 +258,6 @@ struct StdAlgoMinMaxOpsTestMinMax {
 };
 
 TEST(TEST_CATEGORY, minmax_within_parfor) {
-  namespace KE = Kokkos::Experimental;
   using view_t = Kokkos::View<double*>;
   view_t a("a", 10);
 
@@ -277,28 +273,26 @@ TEST(TEST_CATEGORY, minmax_within_parfor) {
 // test clamp()
 // ----------------------------------------------------------
 TEST(TEST_CATEGORY, clamp) {
-  namespace KE = Kokkos::Experimental;
-
   int a         = 1;
   int b         = 2;
   int c         = 19;
-  const auto& r = KE::clamp(a, b, c);
-  EXPECT_TRUE(&r == &b);
-  EXPECT_TRUE(r == b);
+  const auto& r = Kokkos::clamp(a, b, c);
+  EXPECT_EQ(&r, &b);
+  EXPECT_EQ(r, b);
 
   a              = 5;
   b              = -2;
   c              = 3;
-  const auto& r2 = KE::clamp(a, b, c);
-  EXPECT_TRUE(&r2 == &c);
-  EXPECT_TRUE(r2 == c);
+  const auto& r2 = Kokkos::clamp(a, b, c);
+  EXPECT_EQ(&r2, &c);
+  EXPECT_EQ(r2, c);
 
   a              = 5;
   b              = -2;
   c              = 7;
-  const auto& r3 = KE::clamp(a, b, c);
-  EXPECT_TRUE(&r3 == &a);
-  EXPECT_TRUE(r3 == a);
+  const auto& r3 = Kokkos::clamp(a, b, c);
+  EXPECT_EQ(&r3, &a);
+  EXPECT_EQ(r3, a);
 }
 
 template <class ViewType>
@@ -307,11 +301,10 @@ struct StdAlgoMinMaxOpsTestClamp {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int& ind) const {
-    namespace KE  = Kokkos::Experimental;
     m_view(ind)   = 10.;
     const auto b  = -2.;
     const auto c  = 3.;
-    const auto& r = KE::clamp(m_view(ind), b, c);
+    const auto& r = Kokkos::clamp(m_view(ind), b, c);
     m_view(ind)   = (double)(r);
   }
 
@@ -320,7 +313,6 @@ struct StdAlgoMinMaxOpsTestClamp {
 };
 
 TEST(TEST_CATEGORY, clamp_within_parfor) {
-  namespace KE = Kokkos::Experimental;
   using view_t = Kokkos::View<double*>;
   view_t a("a", 10);
 
diff --git a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
index d7607c4f7..02064d2fc 100644
--- a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
+++ b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
@@ -82,37 +82,6 @@ struct my_complex {
     return *this;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  my_complex &operator=(const volatile my_complex &src) {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  volatile my_complex &operator=(const my_complex &src) volatile {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  volatile my_complex &operator=(const volatile my_complex &src) volatile {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  my_complex(const volatile my_complex &src) {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-  }
-
   KOKKOS_INLINE_FUNCTION
   my_complex(const double &val) {
     re    = val;
@@ -128,13 +97,6 @@ struct my_complex {
     return *this;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void operator+=(const volatile my_complex &src) volatile {
-    re += src.re;
-    im += src.im;
-    dummy += src.dummy;
-  }
-
   KOKKOS_INLINE_FUNCTION
   my_complex operator+(const my_complex &src) {
     my_complex tmp = *this;
@@ -144,15 +106,6 @@ struct my_complex {
     return tmp;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  my_complex operator+(const volatile my_complex &src) volatile {
-    my_complex tmp = *this;
-    tmp.re += src.re;
-    tmp.im += src.im;
-    tmp.dummy += src.dummy;
-    return tmp;
-  }
-
   KOKKOS_INLINE_FUNCTION
   my_complex &operator*=(const my_complex &src) {
     double re_tmp = re * src.re - im * src.im;
@@ -163,15 +116,6 @@ struct my_complex {
     return *this;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void operator*=(const volatile my_complex &src) volatile {
-    double re_tmp = re * src.re - im * src.im;
-    double im_tmp = re * src.im + im * src.re;
-    re            = re_tmp;
-    im            = im_tmp;
-    dummy *= src.dummy;
-  }
-
   KOKKOS_INLINE_FUNCTION
   bool operator==(const my_complex &src) const {
     return (re == src.re) && (im == src.im) && (dummy == src.dummy);
@@ -229,12 +173,6 @@ struct array_reduce {
     return *this;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  array_reduce &operator=(const volatile array_reduce &src) {
-    for (int i = 0; i < N; i++) data[i] = src.data[i];
-    return *this;
-  }
-
   KOKKOS_INLINE_FUNCTION  // add operator
       array_reduce &
       operator=(const scalar_t val) {
@@ -254,11 +192,6 @@ struct array_reduce {
     for (int i = 0; i < N; i++) data[i] += src.data[i];
     return *this;
   }
-  KOKKOS_INLINE_FUNCTION  // volatile add operator
-      void
-      operator+=(const volatile array_reduce &src) volatile {
-    for (int i = 0; i < N; i++) data[i] += src.data[i];
-  }
   KOKKOS_INLINE_FUNCTION  // add operator
       array_reduce
       operator+(const array_reduce &src) const {
@@ -279,11 +212,6 @@ struct array_reduce {
     for (int i = 0; i < N; i++) data[i] *= src.data[i];
     return *this;
   }
-  KOKKOS_INLINE_FUNCTION  // volatile add operator
-      void
-      operator*=(const volatile array_reduce &src) volatile {
-    for (int i = 0; i < N; i++) data[i] *= src.data[i];
-  }
   KOKKOS_INLINE_FUNCTION  // add operator
       array_reduce
       operator*(const array_reduce &src) const {
@@ -320,9 +248,6 @@ struct point_t {
   KOKKOS_FUNCTION
   point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){};
 
-  KOKKOS_FUNCTION
-  point_t(const volatile point_t &val) : x(val.x), y(val.y), z(val.z){};
-
   KOKKOS_FUNCTION
   point_t(const int rhs) { x = y = z = static_cast<uint8_t>(rhs); }
 
@@ -330,19 +255,19 @@ struct point_t {
   explicit operator int() const { return static_cast<int>(x + y + z); }
 
   KOKKOS_FUNCTION
-  bool operator==(const volatile point_t rhs) const volatile {
+  bool operator==(const point_t rhs) const {
     return (x == rhs.x && y == rhs.y && z == rhs.z);
   }
 
   KOKKOS_FUNCTION
-  void operator=(point_t rhs) volatile {
+  void operator=(point_t rhs) {
     x = rhs.x;
     y = rhs.y;
     z = rhs.z;
   }
 
   KOKKOS_FUNCTION
-  volatile point_t operator+=(const volatile point_t rhs) volatile {
+  point_t operator+=(const point_t rhs) {
     x += rhs.x;
     y += rhs.y;
     z += rhs.z;
diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
index 52989aa5d..0f34ff436 100644
--- a/packages/kokkos/core/unit_test/TestNumericTraits.hpp
+++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
@@ -48,7 +48,6 @@
 #include <type_traits>
 #include <limits>
 #include "Kokkos_NumericTraits.hpp"
-#include "Kokkos_ExecPolicy.hpp"
 
 struct extrema {
 #define DEFINE_EXTREMA(T, m, M)                 \
@@ -213,8 +212,8 @@ struct TestNumericTraits {
   }
 };
 
-#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_SYCL) || \
-    defined(KOKKOS_ENABLE_OPENMPTARGET)
+#if (defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_ENABLE_CUDA)) || \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
 template <class Tag>
 struct TestNumericTraits<
 #if defined(KOKKOS_ENABLE_CUDA)
@@ -237,15 +236,12 @@ struct TestNumericTraits<
 };
 #endif
 
-#ifdef KOKKOS_COMPILER_NVHPC
-// warning: 'long double' is treated as 'double' in device code
-#pragma diag_suppress 20208
-#endif
-
 TEST(TEST_CATEGORY, numeric_traits_infinity) {
   TestNumericTraits<TEST_EXECSPACE, float, Infinity>();
   TestNumericTraits<TEST_EXECSPACE, double, Infinity>();
-#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1 see issue #4100
+  // fails with XL 16.1.1 see issue #4100
+  // FIXME_NVHPC long double not supported
+#if !defined(KOKKOS_COMPILER_IBM) && !defined(KOKKOS_COMPILER_NVHPC)
   TestNumericTraits<TEST_EXECSPACE, long double, Infinity>();
 #endif
 }
@@ -253,7 +249,9 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) {
 TEST(TEST_CATEGORY, numeric_traits_epsilon) {
   TestNumericTraits<TEST_EXECSPACE, float, Epsilon>();
   TestNumericTraits<TEST_EXECSPACE, double, Epsilon>();
-#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1
+  // fails with XL 16.1.1 see issue #4100
+  // FIXME_NVHPC long double not supported
+#if !defined(KOKKOS_COMPILER_IBM) && !defined(KOKKOS_COMPILER_NVHPC)
   TestNumericTraits<TEST_EXECSPACE, long double, Epsilon>();
 #endif
 }
@@ -261,25 +259,41 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) {
 TEST(TEST_CATEGORY, numeric_traits_round_error) {
   TestNumericTraits<TEST_EXECSPACE, float, RoundError>();
   TestNumericTraits<TEST_EXECSPACE, double, RoundError>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, RoundError>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_norm_min) {
   TestNumericTraits<TEST_EXECSPACE, float, NormMin>();
   TestNumericTraits<TEST_EXECSPACE, double, NormMin>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, NormMin>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_denorm_min) {
   TestNumericTraits<TEST_EXECSPACE, float, DenormMin>();
   TestNumericTraits<TEST_EXECSPACE, double, DenormMin>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, DenormMin>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_reciprocal_overflow_threshold) {
   TestNumericTraits<TEST_EXECSPACE, float, ReciprocalOverflowThreshold>();
   TestNumericTraits<TEST_EXECSPACE, double, ReciprocalOverflowThreshold>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, ReciprocalOverflowThreshold>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_finite_min_max) {
@@ -314,8 +328,12 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) {
   TestNumericTraits<TEST_EXECSPACE, float, FiniteMax>();
   TestNumericTraits<TEST_EXECSPACE, double, FiniteMin>();
   TestNumericTraits<TEST_EXECSPACE, double, FiniteMax>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, FiniteMin>();
   TestNumericTraits<TEST_EXECSPACE, long double, FiniteMax>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_digits) {
@@ -333,7 +351,11 @@ TEST(TEST_CATEGORY, numeric_traits_digits) {
   TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits>();
   TestNumericTraits<TEST_EXECSPACE, float, Digits>();
   TestNumericTraits<TEST_EXECSPACE, double, Digits>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, Digits>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_digits10) {
@@ -351,15 +373,22 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) {
   TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits10>();
   TestNumericTraits<TEST_EXECSPACE, float, Digits10>();
   TestNumericTraits<TEST_EXECSPACE, double, Digits10>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, Digits10>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_max_digits10) {
   TestNumericTraits<TEST_EXECSPACE, float, MaxDigits10>();
   TestNumericTraits<TEST_EXECSPACE, double, MaxDigits10>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, MaxDigits10>();
+#endif
 }
-
 TEST(TEST_CATEGORY, numeric_traits_radix) {
   TestNumericTraits<TEST_EXECSPACE, bool, Radix>();
   TestNumericTraits<TEST_EXECSPACE, char, Radix>();
@@ -375,7 +404,11 @@ TEST(TEST_CATEGORY, numeric_traits_radix) {
   TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Radix>();
   TestNumericTraits<TEST_EXECSPACE, float, Radix>();
   TestNumericTraits<TEST_EXECSPACE, double, Radix>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, Radix>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) {
@@ -383,8 +416,12 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) {
   TestNumericTraits<TEST_EXECSPACE, float, MaxExponent>();
   TestNumericTraits<TEST_EXECSPACE, double, MinExponent>();
   TestNumericTraits<TEST_EXECSPACE, double, MaxExponent>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, MinExponent>();
   TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) {
@@ -392,17 +429,27 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) {
   TestNumericTraits<TEST_EXECSPACE, float, MaxExponent10>();
   TestNumericTraits<TEST_EXECSPACE, double, MinExponent10>();
   TestNumericTraits<TEST_EXECSPACE, double, MaxExponent10>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>();
   TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>();
+#endif
 }
-
 TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) {
   TestNumericTraits<TEST_EXECSPACE, float, QuietNaN>();
   TestNumericTraits<TEST_EXECSPACE, float, SignalingNaN>();
   TestNumericTraits<TEST_EXECSPACE, double, QuietNaN>();
   TestNumericTraits<TEST_EXECSPACE, double, SignalingNaN>();
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+  // Unsupported unknown data type 38.
+  // Unsupported unknown data type 38.
+  // Unsupported unknown data type 38.
+  // nvc++-Fatal-/home/projects/x86-64/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/tools/cpp2
+  // TERMINATED by signal 11
   TestNumericTraits<TEST_EXECSPACE, long double, QuietNaN>();
   TestNumericTraits<TEST_EXECSPACE, long double, SignalingNaN>();
+#endif
 }
 
 namespace NumericTraitsSFINAE {
@@ -604,7 +651,10 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10);
 
 // Workaround compiler issue error: expression must have a constant value
 // See kokkos/kokkos#4574
-#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
+// There is the same bug with CUDA 11.6
+// FIXME_NVHPC FIXME_CUDA FIXME_NVCC
+#if !defined(KOKKOS_COMPILER_NVHPC) && (CUDA_VERSION < 11060) && \
+    !(defined(KOKKOS_COMPILER_NVCC) && !defined(KOKKOS_ENABLE_CUDA))
 CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, quiet_NaN);
 CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, quiet_NaN);
 CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, quiet_NaN);
diff --git a/packages/kokkos/core/unit_test/TestOther.hpp b/packages/kokkos/core/unit_test/TestOther.hpp
index c068d250c..5596f373b 100644
--- a/packages/kokkos/core/unit_test/TestOther.hpp
+++ b/packages/kokkos/core/unit_test/TestOther.hpp
@@ -44,9 +44,13 @@
 
 #ifndef KOKKOS_TEST_OTHER_HPP
 #define KOKKOS_TEST_OTHER_HPP
-#include <TestTemplateMetaFunctions.hpp>
 #include <TestAggregate.hpp>
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC:
+// NVC++-F-0000-Internal compiler error. Basic LLVM base data type required 23
+// (/ascldap/users/crtrott/Kokkos/kokkos/build/core/unit_test/cuda/TestCuda_Other.cpp:
+// 204) NVC++/x86-64 Linux 22.3-0: compilation aborted
 #include <TestMemoryPool.hpp>
+#endif
 #include <TestCXX11.hpp>
 
 #include <TestViewCtorPropEmbeddedDim.hpp>
diff --git a/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp b/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp
new file mode 100644
index 000000000..11684f8a8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp
@@ -0,0 +1,499 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <impl/Kokkos_ParseCommandLineArgumentsAndEnvironmentVariables.hpp>
+#include <impl/Kokkos_InitializationSettings.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>
+#include <impl/Kokkos_Command_Line_Parsing.hpp>
+
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+#include <regex>
+#include <string>
+#include <unordered_map>
+
+namespace {
+
+class EnvVarsHelper {
+  // do not let GTest run unit tests that set the environment concurrently
+  static std::mutex mutex_;
+  std::vector<std::string> vars_;
+  // FIXME_CXX17 prefer optional
+  // store name of env var that was already set (if any)
+  // in which case unit test is skipped
+  std::unique_ptr<std::string> skip_;
+
+  void setup(std::unordered_map<std::string, std::string> const& vars) {
+    for (auto const& x : vars) {
+      auto const& name  = x.first;
+      auto const& value = x.second;
+      // skip unit test if env var is already set
+      if (getenv(name.c_str())) {
+        skip_ = std::make_unique<std::string>(name);
+        break;
+      }
+#ifdef _WIN32
+      int const error_code = _putenv((name + "=" + value).c_str());
+#else
+      int const error_code =
+          setenv(name.c_str(), value.c_str(), /*overwrite=*/0);
+#endif
+      if (error_code != 0) {
+        std::cerr << "failed to set environment variable '" << name << "="
+                  << value << "'\n";
+        std::abort();
+      }
+      vars_.push_back(name);
+    }
+  }
+  void teardown() {
+    for (auto const& name : vars_) {
+#ifdef _WIN32
+      int const error_code = _putenv((name + "=").c_str());
+#else
+      int const error_code = unsetenv(name.c_str());
+#endif
+      if (error_code != 0) {
+        std::cerr << "failed to unset environment variable '" << name << "'\n";
+        std::abort();
+      }
+    }
+  }
+
+ public:
+  auto& skip() { return skip_; }
+  EnvVarsHelper(std::unordered_map<std::string, std::string> const& vars) {
+    mutex_.lock();
+    setup(vars);
+  }
+  EnvVarsHelper& operator=(
+      std::unordered_map<std::string, std::string> const& vars) {
+    teardown();
+    setup(vars);
+    return *this;
+  }
+  ~EnvVarsHelper() {
+    teardown();
+    mutex_.unlock();
+  }
+  EnvVarsHelper(EnvVarsHelper&) = delete;
+  EnvVarsHelper& operator=(EnvVarsHelper&) = delete;
+  friend std::ostream& operator<<(std::ostream& os, EnvVarsHelper const& ev) {
+    for (auto const& name : ev.vars_) {
+      os << name << '=' << std::getenv(name.c_str()) << '\n';
+    }
+    return os;
+  }
+};
+std::mutex EnvVarsHelper::mutex_;
+#define SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev)       \
+  if (ev.skip()) {                                         \
+    GTEST_SKIP() << "environment variable '" << *ev.skip() \
+                 << "' is already set";                    \
+  }                                                        \
+  static_assert(true, "no-op to require trailing semicolon")
+
+class CmdLineArgsHelper {
+  int argc_;
+  std::vector<char*> argv_;
+  std::vector<std::unique_ptr<char[]>> args_;
+
+ public:
+  CmdLineArgsHelper(std::vector<std::string> const& args) : argc_(args.size()) {
+    for (auto const& x : args) {
+      args_.emplace_back(new char[x.size() + 1]);
+      char* ptr = args_.back().get();
+      strcpy(ptr, x.c_str());
+      argv_.push_back(ptr);
+    }
+    argv_.push_back(nullptr);
+  }
+  int& argc() { return argc_; }
+  char** argv() { return argv_.data(); }
+};
+#define EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, ...) \
+  do {                                                    \
+    std::vector<std::string> expected_argv = __VA_ARGS__; \
+                                                          \
+    int expected_argc = expected_argv.size();             \
+    EXPECT_EQ(cla.argc(), expected_argc);                 \
+    for (int i = 0; i < expected_argc; ++i) {             \
+      EXPECT_EQ(cla.argv()[i], expected_argv[i])          \
+          << "arguments differ at index " << i;           \
+    }                                                     \
+    EXPECT_EQ(cla.argv()[cla.argc()], nullptr);           \
+  } while (false)
+
+TEST(defaultdevicetype, cmd_line_args_num_threads) {
+  CmdLineArgsHelper cla = {{
+      "--foo=bar",
+      "--kokkos-num-threads=1",
+      "--kokkos-num-threads=2",
+  }};
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  EXPECT_TRUE(settings.has_num_threads());
+  EXPECT_EQ(settings.get_num_threads(), 2);
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--foo=bar"});
+}
+
+TEST(defaultdevicetype, cmd_line_args_device_id) {
+  CmdLineArgsHelper cla = {{
+      "--kokkos-device-id=3",
+      "--dummy",
+      "--kokkos-device-id=4",
+  }};
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  EXPECT_TRUE(settings.has_device_id());
+  EXPECT_EQ(settings.get_device_id(), 4);
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--dummy"});
+}
+
+TEST(defaultdevicetype, cmd_line_args_num_devices) {
+  CmdLineArgsHelper cla = {{
+      "--kokkos-num-devices=5,6",
+      "--kokkos-num-devices=7",
+      "-v",
+  }};
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  EXPECT_TRUE(settings.has_num_devices());
+  EXPECT_EQ(settings.get_num_devices(), 7);
+  // this is the current behavior, not suggesting this cannot be revisited
+  EXPECT_TRUE(settings.has_skip_device()) << "behavior changed see comment";
+  EXPECT_EQ(settings.get_skip_device(), 6) << "behavior changed see comment";
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-v"});
+}
+
+TEST(defaultdevicetype, cmd_line_args_disable_warning) {
+  CmdLineArgsHelper cla = {{
+      "--kokkos-disable-warnings=1",
+      "--kokkos-disable-warnings=false",
+  }};
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  EXPECT_TRUE(settings.has_disable_warnings());
+  EXPECT_FALSE(settings.get_disable_warnings());
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {});
+}
+
+TEST(defaultdevicetype, cmd_line_args_tune_internals) {
+  CmdLineArgsHelper cla = {{
+      "--kokkos-tune-internals",
+      "--kokkos-num-threads=3",
+  }};
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  EXPECT_TRUE(settings.has_tune_internals());
+  EXPECT_TRUE(settings.get_tune_internals());
+  EXPECT_TRUE(settings.has_num_threads());
+  EXPECT_EQ(settings.get_num_threads(), 3);
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {});
+}
+
+TEST(defaultdevicetype, cmd_line_args_help) {
+  CmdLineArgsHelper cla = {{
+      "--help",
+  }};
+  Kokkos::InitializationSettings settings;
+  ::testing::internal::CaptureStdout();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  auto captured = ::testing::internal::GetCapturedStdout();
+  // check that error message was only printed once
+  EXPECT_EQ(captured.find("--kokkos-help"), captured.rfind("--kokkos-help"));
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--help"});
+  auto const help_message_length = captured.length();
+
+  cla = {{
+      {"--kokkos-help"},
+  }};
+  ::testing::internal::CaptureStdout();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  captured = ::testing::internal::GetCapturedStdout();
+  EXPECT_EQ(captured.length(), help_message_length);
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {});
+
+  cla = {{
+      {"--kokkos-help"},
+      {"--help"},
+      {"--kokkos-help"},
+  }};
+  ::testing::internal::CaptureStdout();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  captured = ::testing::internal::GetCapturedStdout();
+  EXPECT_EQ(captured.length(), help_message_length);
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--help"});
+}
+
+TEST(defaultdevicetype, cmd_line_args_tools_arguments) {
+  CmdLineArgsHelper cla = {{
+      "--kokkos-tool-libs=ich_tue_nur.so",
+  }};
+  Kokkos::InitializationSettings settings;
+  ::testing::internal::CaptureStderr();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  auto captured = ::testing::internal::GetCapturedStderr();
+  EXPECT_TRUE(captured.find("not recognized") != std::string::npos &&
+              captured.find("--kokkos-tool-libs=ich_tue_nur.so") !=
+                  std::string::npos &&
+              !settings.has_tools_libs())
+      << captured;
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(
+      cla, {"--kokkos-tool-libs=ich_tue_nur.so"});
+
+  cla      = {{
+      "--kokkos-tools-libs=ich_tue_nur.so",
+  }};
+  settings = {};
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  EXPECT_TRUE(settings.has_tools_libs());
+  EXPECT_EQ(settings.get_tools_libs(), "ich_tue_nur.so");
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {});
+}
+
+TEST(defaultdevicetype, cmd_line_args_unrecognized_flag) {
+  CmdLineArgsHelper cla = {{
+      "--kokkos_num_threads=4",  // underscores instead of dashes
+  }};
+  Kokkos::InitializationSettings settings;
+  ::testing::internal::CaptureStderr();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  auto captured = ::testing::internal::GetCapturedStderr();
+  EXPECT_TRUE(captured.find("not recognized") != std::string::npos &&
+              captured.find("--kokkos_num_threads=4") != std::string::npos &&
+              !settings.has_num_threads())
+      << captured;
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--kokkos_num_threads=4"});
+
+  cla = {{
+      "-kokkos-num-threads=4",  // missing one leading dash
+  }};
+  ::testing::internal::CaptureStderr();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  captured = ::testing::internal::GetCapturedStderr();
+  EXPECT_TRUE(captured.find("not recognized") != std::string::npos &&
+              captured.find("-kokkos-num-threads=4") != std::string::npos &&
+              !settings.has_num_threads())
+      << captured;
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-kokkos-num-threads=4"});
+
+  cla = {{
+      "--kokko-num-threads=4",  // no warning when prefix misspelled
+  }};
+  ::testing::internal::CaptureStderr();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  captured = ::testing::internal::GetCapturedStderr();
+  EXPECT_TRUE(captured.empty() && !settings.has_num_threads()) << captured;
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--kokko-num-threads=4"});
+
+  Kokkos::Impl::do_not_warn_not_recognized_command_line_argument(
+      std::regex{"^--kokkos-extension.*"});
+  cla = {{
+      "--kokkos-extension-option=value",  // user explicitly asked not to warn
+                                          // about that prefix
+  }};
+  ::testing::internal::CaptureStderr();
+  Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings);
+  captured = ::testing::internal::GetCapturedStderr();
+  EXPECT_TRUE(captured.empty()) << captured;
+  EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla,
+                                          {"--kokkos-extension-option=value"});
+}
+
+TEST(defaultdevicetype, env_vars_num_threads) {
+  EnvVarsHelper ev = {{
+      {"KOKKOS_NUM_THREADS", "24"},
+      {"KOKKOS_DISABLE_WARNINGS", "1"},
+  }};
+  SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_environment_variables(settings);
+  EXPECT_TRUE(settings.has_num_threads());
+  EXPECT_EQ(settings.get_num_threads(), 24);
+  EXPECT_TRUE(settings.has_disable_warnings());
+  EXPECT_TRUE(settings.get_disable_warnings());
+
+  ev = {{
+      {"KOKKOS_NUM_THREADS", "1ABC"},
+  }};
+  SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+  settings = {};
+  Kokkos::Impl::parse_environment_variables(settings);
+  EXPECT_TRUE(settings.has_num_threads());
+  EXPECT_EQ(settings.get_num_threads(), 1);
+}
+
+TEST(defaultdevicetype, env_vars_device_id) {
+  EnvVarsHelper ev = {{
+      {"KOKKOS_DEVICE_ID", "33"},
+  }};
+  SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_environment_variables(settings);
+  EXPECT_TRUE(settings.has_device_id());
+  EXPECT_EQ(settings.get_device_id(), 33);
+}
+
+TEST(defaultdevicetype, env_vars_num_devices) {
+  EnvVarsHelper ev = {{
+      {"KOKKOS_NUM_DEVICES", "4"},
+      {"KOKKOS_SKIP_DEVICE", "1"},
+  }};
+  SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+  Kokkos::InitializationSettings settings;
+  Kokkos::Impl::parse_environment_variables(settings);
+  EXPECT_TRUE(settings.has_num_devices());
+  EXPECT_EQ(settings.get_num_devices(), 4);
+  EXPECT_TRUE(settings.has_skip_device());
+  EXPECT_EQ(settings.get_skip_device(), 1);
+}
+
+TEST(defaultdevicetype, env_vars_disable_warnings) {
+  for (auto const& value_true : {"1", "true", "TRUE", "yEs"}) {
+    EnvVarsHelper ev = {{
+        {"KOKKOS_DISABLE_WARNINGS", value_true},
+    }};
+    SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+    Kokkos::InitializationSettings settings;
+    Kokkos::Impl::parse_environment_variables(settings);
+    EXPECT_TRUE(settings.has_disable_warnings())
+        << "KOKKOS_DISABLE_WARNINGS=" << value_true;
+    EXPECT_TRUE(settings.get_disable_warnings())
+        << "KOKKOS_DISABLE_WARNINGS=" << value_true;
+  }
+  for (auto const& value_false : {"0", "fAlse", "No"}) {
+    EnvVarsHelper ev = {{
+        {"KOKKOS_DISABLE_WARNINGS", value_false},
+    }};
+    SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+    Kokkos::InitializationSettings settings;
+    Kokkos::Impl::parse_environment_variables(settings);
+    EXPECT_TRUE(settings.has_disable_warnings())
+        << "KOKKOS_DISABLE_WARNINGS=" << value_false;
+    EXPECT_FALSE(settings.get_disable_warnings())
+        << "KOKKOS_DISABLE_WARNINGS=" << value_false;
+  }
+}
+
+TEST(defaultdevicetype, env_vars_tune_internals) {
+  for (auto const& value_true : {"1", "yES", "true", "TRUE", "tRuE"}) {
+    EnvVarsHelper ev = {{
+        {"KOKKOS_TUNE_INTERNALS", value_true},
+    }};
+    SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+    Kokkos::InitializationSettings settings;
+    Kokkos::Impl::parse_environment_variables(settings);
+    EXPECT_TRUE(settings.has_tune_internals())
+        << "KOKKOS_TUNE_INTERNALS=" << value_true;
+    EXPECT_TRUE(settings.get_tune_internals())
+        << "KOKKOS_TUNE_INTERNALS=" << value_true;
+  }
+  for (auto const& value_false : {"0", "false", "no"}) {
+    EnvVarsHelper ev = {{
+        {"KOKKOS_TUNE_INTERNALS", value_false},
+    }};
+    SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);
+    Kokkos::InitializationSettings settings;
+    Kokkos::Impl::parse_environment_variables(settings);
+    EXPECT_TRUE(settings.has_tune_internals())
+        << "KOKKOS_TUNE_INTERNALS=" << value_false;
+    EXPECT_FALSE(settings.get_tune_internals())
+        << "KOKKOS_TUNE_INTERNALS=" << value_false;
+  }
+}
+
+TEST(defaultdevicetype, visible_devices) {
+#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV)                    \
+  do {                                                                \
+    EnvVarsHelper ev{ENV};                                            \
+    SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev);                     \
+    Kokkos::InitializationSettings settings;                          \
+    Kokkos::Impl::parse_environment_variables(settings);              \
+    auto computed = Kokkos::Impl::get_visible_devices(settings, CNT); \
+    std::vector<int> expected = DEV;                                  \
+    EXPECT_EQ(expected.size(), computed.size())                       \
+        << ev << "device count: " << CNT;                             \
+    auto n = std::min<int>(expected.size(), computed.size());         \
+    for (int i = 0; i < n; ++i) {                                     \
+      EXPECT_EQ(expected[i], computed[i])                             \
+          << "devices differ at index " << i << '\n'                  \
+          << ev << "device count: " << CNT;                           \
+    }                                                                 \
+  } while (false)
+
+#define DEV(...) \
+  std::vector<int> { __VA_ARGS__ }
+#define ENV(...) std::unordered_map<std::string, std::string>{__VA_ARGS__}
+
+  // first test with all environment variables that are involved in determining
+  // the visible devices so user set var do not mess up the logic below.
+  KOKKOS_TEST_VISIBLE_DEVICES(
+      ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"},
+          {"KOKKOS_SKIP_DEVICE", "1"}),
+      6, DEV(2, 1));
+  KOKKOS_TEST_VISIBLE_DEVICES(
+      ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, ), 6,
+      DEV(2, 1));
+  KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_NUM_DEVICES", "3"}), 6,
+                              DEV(0, 1, 2));
+  KOKKOS_TEST_VISIBLE_DEVICES(
+      ENV({"KOKKOS_NUM_DEVICES", "4"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6,
+      DEV(0, 2, 3));
+  KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_VISIBLE_DEVICES", "1,3,4"}), 6,
+                              DEV(1, 3, 4));
+  KOKKOS_TEST_VISIBLE_DEVICES(
+      ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6,
+      DEV(2, 1));
+  KOKKOS_TEST_VISIBLE_DEVICES(ENV(), 4, DEV(0, 1, 2, 3));
+
+#undef ENV
+#undef DEV
+#undef KOKKOS_TEST_VISIBLE_DEVICES
+}
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 26eb22670..b25acb455 100644
--- a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -45,7 +45,6 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <type_traits>
@@ -76,7 +75,7 @@ class TestRangePolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Static>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -91,7 +90,7 @@ class TestRangePolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Static>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -107,7 +106,7 @@ class TestRangePolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -123,7 +122,7 @@ class TestRangePolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -139,7 +138,7 @@ class TestRangePolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -203,7 +202,7 @@ class TestRangePolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -219,7 +218,7 @@ class TestRangePolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -235,7 +234,7 @@ class TestRangePolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -351,7 +350,7 @@ class TestTeamPolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Static>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -366,7 +365,7 @@ class TestTeamPolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Static>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -382,7 +381,7 @@ class TestTeamPolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -398,7 +397,7 @@ class TestTeamPolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -414,7 +413,7 @@ class TestTeamPolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -478,7 +477,7 @@ class TestTeamPolicyConstruction {
                                 typename execution_space::size_type>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -494,7 +493,7 @@ class TestTeamPolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
@@ -510,7 +509,7 @@ class TestTeamPolicyConstruction {
       ASSERT_TRUE((std::is_same<index_type, long>::value));
       ASSERT_TRUE((std::is_same<schedule_type,
                                 Kokkos::Schedule<Kokkos::Dynamic>>::value));
-      ASSERT_TRUE((std::is_same<work_tag, void>::value));
+      ASSERT_TRUE((std::is_void<work_tag>::value));
     }
 
     {
diff --git a/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
index e45d84e7e..3be6b70ec 100644
--- a/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
+++ b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
@@ -50,6 +50,8 @@
 
 #include <gtest/gtest.h>
 
+namespace {
+
 // FIXME instantiate only once for default host execution space
 TEST(TEST_CATEGORY, quad_precision_reductions) {
   int const n = 100;
@@ -98,12 +100,52 @@ TEST(TEST_CATEGORY, quad_precision_common_math_functions) {
   Kokkos::parallel_for(
       Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, 1),
       KOKKOS_LAMBDA(int) {
-        (void)Kokkos::Experimental::fabs((__float128)0);
-        (void)Kokkos::Experimental::sqrt((__float128)1);
-        (void)Kokkos::Experimental::exp((__float128)2);
-        (void)Kokkos::Experimental::sin((__float128)3);
-        (void)Kokkos::Experimental::cosh((__float128)4);
+        (void)Kokkos::fabs((__float128)0);
+        (void)Kokkos::sqrt((__float128)1);
+        (void)Kokkos::exp((__float128)2);
+        (void)Kokkos::sin((__float128)3);
+        (void)Kokkos::cosh((__float128)4);
       });
 }
 
+#define STATIC_ASSERT(...) static_assert(__VA_ARGS__, "")  // FIXME C++17
+
+constexpr bool test_quad_precision_promotion_traits() {
+  STATIC_ASSERT(
+      std::is_same<__float128, decltype(Kokkos::pow(__float128(1), 2))>::value);
+  STATIC_ASSERT(std::is_same<__float128,
+                             decltype(Kokkos::hypot(3, __float128(4)))>::value);
+  return true;
+}
+
+STATIC_ASSERT(test_quad_precision_promotion_traits());
+
+constexpr bool test_quad_precision_math_constants() {
+  // compare to mathematical constants defined in libquadmath when available
+  // clang-format off
+  STATIC_ASSERT(Kokkos::Experimental::e_v     <__float128> == M_Eq);
+  STATIC_ASSERT(Kokkos::Experimental::log2e_v <__float128> == M_LOG2Eq);
+  STATIC_ASSERT(Kokkos::Experimental::log10e_v<__float128> == M_LOG10Eq);
+  STATIC_ASSERT(Kokkos::Experimental::pi_v    <__float128> == M_PIq);
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 930)
+  STATIC_ASSERT(Kokkos::Experimental::inv_pi_v<__float128> == M_1_PIq);
+#endif
+  // inv_sqrtpi_v
+  STATIC_ASSERT(Kokkos::Experimental::ln2_v   <__float128> == M_LN2q);
+  STATIC_ASSERT(Kokkos::Experimental::ln10_v  <__float128> == M_LN10q);
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 930)
+  STATIC_ASSERT(Kokkos::Experimental::sqrt2_v <__float128> == M_SQRT2q);
+#endif
+  // sqrt3_v
+  // inv_sqrt3_v
+  // egamma_v
+  // phi_v
+  // clang-format on
+  return true;
+}
+
+STATIC_ASSERT(test_quad_precision_math_constants());
+
+}  // namespace
+
 #endif
diff --git a/packages/kokkos/core/unit_test/TestRealloc.hpp b/packages/kokkos/core/unit_test/TestRealloc.hpp
index 2b3e1ac37..3de42070e 100644
--- a/packages/kokkos/core/unit_test/TestRealloc.hpp
+++ b/packages/kokkos/core/unit_test/TestRealloc.hpp
@@ -71,81 +71,105 @@ void impl_testRealloc() {
     using view_type = Kokkos::View<int*, DeviceType>;
     view_type view_1d("view_1d", sizes[0]);
     const int* oldPointer = view_1d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    auto const& oldLabel  = view_1d.label();
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_1d, sizes[0]);
+    auto const& newLabel = view_1d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_1d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int**, DeviceType>;
     view_type view_2d("view_2d", sizes[0], sizes[1]);
+    auto const& oldLabel  = view_2d.label();
     const int* oldPointer = view_2d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_2d, sizes[0], sizes[1]);
+    auto const& newLabel = view_2d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_2d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int***, DeviceType>;
     view_type view_3d("view_3d", sizes[0], sizes[1], sizes[2]);
+    auto const& oldLabel  = view_3d.label();
     const int* oldPointer = view_3d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_3d, sizes[0], sizes[1], sizes[2]);
+    auto const& newLabel = view_3d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_3d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int****, DeviceType>;
     view_type view_4d("view_4d", sizes[0], sizes[1], sizes[2], sizes[3]);
+    auto const& oldLabel  = view_4d.label();
     const int* oldPointer = view_4d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_4d, sizes[0], sizes[1], sizes[2], sizes[3]);
+    auto const& newLabel = view_4d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_4d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int*****, DeviceType>;
     view_type view_5d("view_5d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4]);
+    auto const& oldLabel  = view_5d.label();
     const int* oldPointer = view_5d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_5d, sizes[0], sizes[1], sizes[2], sizes[3],
                      sizes[4]);
+    auto const& newLabel = view_5d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_5d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int******, DeviceType>;
     view_type view_6d("view_6d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4], sizes[5]);
     const int* oldPointer = view_6d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    auto const& oldLabel  = view_6d.label();
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_6d, sizes[0], sizes[1], sizes[2], sizes[3],
                      sizes[4], sizes[5]);
+    auto const& newLabel = view_6d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_6d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int*******, DeviceType>;
     view_type view_7d("view_7d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4], sizes[5], sizes[6]);
+    auto const& oldLabel  = view_7d.label();
     const int* oldPointer = view_7d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_7d, sizes[0], sizes[1], sizes[2], sizes[3],
                      sizes[4], sizes[5], sizes[6]);
+    auto const& newLabel = view_7d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_7d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int********, DeviceType>;
     view_type view_8d("view_8d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4], sizes[5], sizes[6], sizes[7]);
+    auto const& oldLabel  = view_8d.label();
     const int* oldPointer = view_8d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     realloc_dispatch(Tag{}, view_8d, sizes[0], sizes[1], sizes[2], sizes[3],
                      sizes[4], sizes[5], sizes[6], sizes[7]);
+    auto const& newLabel = view_8d.label();
+    EXPECT_EQ(oldLabel, newLabel);
     const int* newPointer = view_8d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
 }
 
diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp
index 161b21615..c136b409b 100644
--- a/packages/kokkos/core/unit_test/TestReduce.hpp
+++ b/packages/kokkos/core/unit_test/TestReduce.hpp
@@ -42,7 +42,6 @@
 //@HEADER
 */
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <limits>
@@ -82,7 +81,7 @@ class ReduceFunctor {
   */
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dst, const volatile value_type& src) const {
+  void join(value_type& dst, const value_type& src) const {
     dst.value[0] += src.value[0];
     dst.value[1] += src.value[1];
     dst.value[2] += src.value[2];
@@ -129,8 +128,7 @@ class ReduceFunctorFinalTag {
   ReduceFunctorFinalTag(const size_type arg_nwork) : nwork(arg_nwork) {}
 
   KOKKOS_INLINE_FUNCTION
-  void join(const ReducerTag, volatile value_type& dst,
-            const volatile value_type& src) const {
+  void join(const ReducerTag, value_type& dst, const value_type& src) const {
     dst.value[0] += src.value[0];
     dst.value[1] += src.value[1];
     dst.value[2] += src.value[2];
@@ -174,7 +172,7 @@ class RuntimeReduceFunctor {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile ScalarType dst[], const volatile ScalarType src[]) const {
+  void join(ScalarType dst[], const ScalarType src[]) const {
     for (unsigned i = 0; i < value_count; ++i) dst[i] += src[i];
   }
 
@@ -218,7 +216,7 @@ class RuntimeReduceMinMax {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile ScalarType dst[], const volatile ScalarType src[]) const {
+  void join(ScalarType dst[], const ScalarType src[]) const {
     for (unsigned i = 0; i < value_count; ++i) {
       dst[i] = i % 2 ? (dst[i] < src[i] ? dst[i] : src[i])   // min
                      : (dst[i] > src[i] ? dst[i] : src[i]);  // max
@@ -634,21 +632,35 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) {
   constexpr uint64_t nw = 1000;
 
   uint64_t nsum = (nw / 2) * (nw + 1);
-
-  auto result1_v = Kokkos::View<int64_t, Kokkos::HostSpace>{"result1_v"};
-
-  int64_t result2 = 0;
-
-  auto result3_v = Kokkos::View<int64_t, Kokkos::HostSpace>{"result3_v"};
-
-  Kokkos::parallel_reduce("int_combined-reduce_mixed",
-                          Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw),
-                          functor_type(nw), result1_v, result2,
-                          Kokkos::Sum<int64_t, Kokkos::HostSpace>{result3_v});
-
-  ASSERT_EQ(int64_t(nw), result1_v());
-  ASSERT_EQ(int64_t(nsum), result2);
-  ASSERT_EQ(int64_t(nsum), result3_v());
+  {
+    auto result1_v  = Kokkos::View<int64_t, Kokkos::HostSpace>{"result1_v"};
+    int64_t result2 = 0;
+    auto result3_v  = Kokkos::View<int64_t, Kokkos::HostSpace>{"result3_v"};
+    Kokkos::parallel_reduce("int_combined-reduce_mixed",
+                            Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw),
+                            functor_type(nw), result1_v, result2,
+                            Kokkos::Sum<int64_t, Kokkos::HostSpace>{result3_v});
+    ASSERT_EQ(int64_t(nw), result1_v());
+    ASSERT_EQ(int64_t(nsum), result2);
+    ASSERT_EQ(int64_t(nsum), result3_v());
+  }
+  {
+    using MemorySpace = typename TEST_EXECSPACE::memory_space;
+    auto result1_v    = Kokkos::View<int64_t, MemorySpace>{"result1_v"};
+    int64_t result2   = 0;
+    auto result3_v    = Kokkos::View<int64_t, MemorySpace>{"result3_v"};
+    Kokkos::parallel_reduce("int_combined-reduce_mixed",
+                            Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw),
+                            functor_type(nw), result1_v, result2,
+                            Kokkos::Sum<int64_t, MemorySpace>{result3_v});
+    int64_t result1;
+    Kokkos::deep_copy(result1, result1_v);
+    ASSERT_EQ(int64_t(nw), result1);
+    ASSERT_EQ(int64_t(nsum), result2);
+    int64_t result3;
+    Kokkos::deep_copy(result3, result3_v);
+    ASSERT_EQ(int64_t(nsum), result3);
+  }
 }
 #endif
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
index 4664f2655..2217b9b8a 100644
--- a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
+++ b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
@@ -42,7 +42,6 @@
 //@HEADER
 */
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <limits>
@@ -73,11 +72,6 @@ struct AddPlus {
   KOKKOS_INLINE_FUNCTION
   void join(value_type& dest, const value_type& src) const { dest += src + 1; }
 
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
-    dest += src + 1;
-  }
-
   // Optional.
   KOKKOS_INLINE_FUNCTION
   void init(value_type& val) const { val = value_type(); }
@@ -195,9 +189,7 @@ struct FunctorScalarJoin<0> {
   void operator()(const int& i, double& update) const { update += i; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 };
 
 template <>
@@ -214,9 +206,7 @@ struct FunctorScalarJoin<1> {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 };
 
 template <int ISTEAM>
@@ -232,9 +222,7 @@ struct FunctorScalarJoinFinal<0> {
   void operator()(const int& i, double& update) const { update += i; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 
   KOKKOS_INLINE_FUNCTION
   void final(double& update) const { result() = update; }
@@ -254,9 +242,7 @@ struct FunctorScalarJoinFinal<1> {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 
   KOKKOS_INLINE_FUNCTION
   void final(double& update) const { result() = update; }
@@ -275,9 +261,7 @@ struct FunctorScalarJoinInit<0> {
   void operator()(const int& i, double& update) const { update += i; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 
   KOKKOS_INLINE_FUNCTION
   void init(double& update) const { update = 0.0; }
@@ -297,9 +281,7 @@ struct FunctorScalarJoinInit<1> {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 
   KOKKOS_INLINE_FUNCTION
   void init(double& update) const { update = 0.0; }
@@ -318,9 +300,7 @@ struct FunctorScalarJoinFinalInit<0> {
   void operator()(const int& i, double& update) const { update += i; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 
   KOKKOS_INLINE_FUNCTION
   void final(double& update) const { result() = update; }
@@ -343,9 +323,7 @@ struct FunctorScalarJoinFinalInit<1> {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
-    dst += update;
-  }
+  void join(double& dst, const double& update) const { dst += update; }
 
   KOKKOS_INLINE_FUNCTION
   void final(double& update) const { result() = update; }
@@ -379,7 +357,7 @@ struct Functor2 {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double dst[], const volatile double src[]) const {
+  void join(double dst[], const double src[]) const {
     for (unsigned i = 0; i < value_count; ++i) dst[i] += src[i];
   }
 };
@@ -545,9 +523,9 @@ struct TestReduceCombinatoricalInstantiation {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
     AddLambdaRange(
         N,
-        typename std::conditional<
+        std::conditional_t<
             std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value,
-            void*, Kokkos::InvalidType>::type(),
+            void*, Kokkos::InvalidType>(),
         args...);
 #endif
   }
@@ -558,9 +536,9 @@ struct TestReduceCombinatoricalInstantiation {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
     AddLambdaTeam(
         N,
-        typename std::conditional<
+        std::conditional_t<
             std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value,
-            void*, Kokkos::InvalidType>::type(),
+            void*, Kokkos::InvalidType>(),
         args...);
 #endif
   }
diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp
index 758422794..10e0f3e95 100644
--- a/packages/kokkos/core/unit_test/TestReducers.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers.hpp
@@ -42,7 +42,6 @@
 //@HEADER
 */
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <limits>
@@ -296,8 +295,24 @@ struct TestReducers {
     Scalar reference_sum = 0;
 
     for (int i = 0; i < N; i++) {
-      int denom   = sizeof(Scalar) <= 2 ? 10 : 100;
-      h_values(i) = (Scalar)(rand() % denom);
+      int denom = sizeof(Scalar) <= 2 ? 10 : 100;
+      // clang-format off
+      // For bhalf, we start overflowing integer values at 2^8.
+      //            after 2^8,  we lose representation of odd numbers;
+      //            after 2^9,  we lose representation of odd and even numbers in position 1.
+      //            after 2^10, we lose representation of odd and even numbers in position 1-3.
+      //            after 2^11, we lose representation of odd and even numbers in position 1-7.
+      //            ...
+      // Generally, for IEEE 754 floating point numbers, we start this overflow pattern at: 2^(num_fraction_bits+1).
+      // brain float has num_fraction_bits = 7.
+      // This mask addresses #4719 for N <= 51.
+      // The mask is not needed for N <= 25.
+      // clang-format on
+      int mask =
+          std::is_same<Scalar, Kokkos::Experimental::bhalf_t>::value && N > 25
+              ? (int)0xfffffffe
+              : (int)0xffffffff;
+      h_values(i) = (Scalar)((rand() % denom) & mask);
       reference_sum += h_values(i);
     }
     Kokkos::deep_copy(values, h_values);
@@ -314,19 +329,19 @@ struct TestReducers {
 
       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 0), f,
                               reducer_scalar);
-      ASSERT_EQ(sum_scalar, init);
+      ASSERT_EQ(sum_scalar, init) << "N: " << N;
 
       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f,
                               reducer_scalar);
-      ASSERT_EQ(sum_scalar, reference_sum);
+      ASSERT_EQ(sum_scalar, reference_sum) << "N: " << N;
 
       sum_scalar = init;
       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N),
                               f_tag, reducer_scalar);
-      ASSERT_EQ(sum_scalar, reference_sum);
+      ASSERT_EQ(sum_scalar, reference_sum) << "N: " << N;
 
       Scalar sum_scalar_view = reducer_scalar.reference();
-      ASSERT_EQ(sum_scalar_view, reference_sum);
+      ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N;
     }
 
     {
@@ -337,16 +352,16 @@ struct TestReducers {
                               reducer_view);
       Kokkos::fence();
       Scalar sum_view_scalar = sum_view();
-      ASSERT_EQ(sum_view_scalar, init);
+      ASSERT_EQ(sum_view_scalar, init) << "N: " << N;
 
       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f,
                               reducer_view);
       Kokkos::fence();
       sum_view_scalar = sum_view();
-      ASSERT_EQ(sum_view_scalar, reference_sum);
+      ASSERT_EQ(sum_view_scalar, reference_sum) << "N: " << N;
 
       Scalar sum_view_view = reducer_view.reference();
-      ASSERT_EQ(sum_view_view, reference_sum);
+      ASSERT_EQ(sum_view_view, reference_sum) << "N: " << N;
     }
 
     {
@@ -359,13 +374,13 @@ struct TestReducers {
       Kokkos::fence();
       Scalar sum_view_scalar;
       Kokkos::deep_copy(sum_view_scalar, sum_view);
-      ASSERT_EQ(sum_view_scalar, init);
+      ASSERT_EQ(sum_view_scalar, init) << "N: " << N;
 
       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f,
                               reducer_view);
       Kokkos::fence();
       Kokkos::deep_copy(sum_view_scalar, sum_view);
-      ASSERT_EQ(sum_view_scalar, reference_sum);
+      ASSERT_EQ(sum_view_scalar, reference_sum) << "N: " << N;
     }
   }
 
diff --git a/packages/kokkos/core/unit_test/TestReducers_d.hpp b/packages/kokkos/core/unit_test/TestReducers_d.hpp
index 67f30e6cf..a84a6e20f 100644
--- a/packages/kokkos/core/unit_test/TestReducers_d.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers_d.hpp
@@ -79,50 +79,19 @@ TEST(TEST_CATEGORY, reducers_half_t) {
   TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(25);
 }
 
-// TODO: File a bug report for this?
-// This fails on the CUDA-11.0-NVCC-C++17-RDC CI check.
-// TEST(TEST_CATEGORY, openmp_cuda11_reduction_bug_with_bhalf_t) {
-//  using ThisTestType = Kokkos::Experimental::bhalf_t;
-//  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(50);
-//  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(51);
-//  // For some reason commenting out reductions of 52,53,54,55 causes
-//  // the reduction of 56 to fail on OpenMP with Cuda/11.0
-//  //TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(52);
-//  //TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(53);
-//  //TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(54);
-//  //TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(55);
-//  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(56);
-//}
-
 TEST(TEST_CATEGORY, reducers_bhalf_t) {
-#if defined(KOKKOS_ENABLE_OPENMP)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::OpenMP>::value)
-#else
-  if (true)
-#endif  // ENABLE_OPENMP
-  {
-    using ThisTestType = Kokkos::Experimental::bhalf_t;
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(50);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(51);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(52);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(53);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(54);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(55);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(56);
-    // TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(57);
-    // This could be 57 on device but there seems to be a loss of precision when
-    // running on OpenMP with Cuda/11.0
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(5);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(10);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(15);
-#if (CUDA_VERSION < 11000)
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(20);
-    TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(21);
-#endif
-  } else {
-    GTEST_SKIP();
-  }
+  using ThisTestType = Kokkos::Experimental::bhalf_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(25);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(50);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(51);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(5);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(10);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(15);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(20);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(25);
 }
 
 TEST(TEST_CATEGORY, reducers_int8_t) {
diff --git a/packages/kokkos/core/unit_test/TestResize.hpp b/packages/kokkos/core/unit_test/TestResize.hpp
index cf5c0df6f..cd1fde4a9 100644
--- a/packages/kokkos/core/unit_test/TestResize.hpp
+++ b/packages/kokkos/core/unit_test/TestResize.hpp
@@ -71,81 +71,81 @@ void impl_testResize() {
     using view_type = Kokkos::View<int*, DeviceType>;
     view_type view_1d("view_1d", sizes[0]);
     const int* oldPointer = view_1d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_1d, sizes[0]);
     const int* newPointer = view_1d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int**, DeviceType>;
     view_type view_2d("view_2d", sizes[0], sizes[1]);
     const int* oldPointer = view_2d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_2d, sizes[0], sizes[1]);
     const int* newPointer = view_2d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int***, DeviceType>;
     view_type view_3d("view_3d", sizes[0], sizes[1], sizes[2]);
     const int* oldPointer = view_3d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_3d, sizes[0], sizes[1], sizes[2]);
     const int* newPointer = view_3d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int****, DeviceType>;
     view_type view_4d("view_4d", sizes[0], sizes[1], sizes[2], sizes[3]);
     const int* oldPointer = view_4d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_4d, sizes[0], sizes[1], sizes[2], sizes[3]);
     const int* newPointer = view_4d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int*****, DeviceType>;
     view_type view_5d("view_5d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4]);
     const int* oldPointer = view_5d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_5d, sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4]);
     const int* newPointer = view_5d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int******, DeviceType>;
     view_type view_6d("view_6d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4], sizes[5]);
     const int* oldPointer = view_6d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_6d, sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4], sizes[5]);
     const int* newPointer = view_6d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int*******, DeviceType>;
     view_type view_7d("view_7d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4], sizes[5], sizes[6]);
     const int* oldPointer = view_7d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_7d, sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4], sizes[5], sizes[6]);
     const int* newPointer = view_7d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   {
     using view_type = Kokkos::View<int********, DeviceType>;
     view_type view_8d("view_8d", sizes[0], sizes[1], sizes[2], sizes[3],
                       sizes[4], sizes[5], sizes[6], sizes[7]);
     const int* oldPointer = view_8d.data();
-    EXPECT_TRUE(oldPointer != nullptr);
+    EXPECT_NE(oldPointer, nullptr);
     resize_dispatch(Tag{}, view_8d, sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4], sizes[5], sizes[6], sizes[7]);
     const int* newPointer = view_8d.data();
-    EXPECT_TRUE(oldPointer == newPointer);
+    EXPECT_EQ(oldPointer, newPointer);
   }
   // Resize without initialization: check if data preserved
   {
@@ -156,7 +156,7 @@ void impl_testResize() {
     Kokkos::deep_copy(view_1d, 111);
     Kokkos::deep_copy(h_view_1d_old, view_1d);
     resize_dispatch(Tag{}, view_1d, 2 * sizes[0]);
-    EXPECT_TRUE(view_1d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_1d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_1d =
         Kokkos::create_mirror_view(view_1d);
     Kokkos::deep_copy(h_view_1d, view_1d);
@@ -167,7 +167,7 @@ void impl_testResize() {
         break;
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
   {
     using view_type = Kokkos::View<int**, DeviceType>;
@@ -177,7 +177,7 @@ void impl_testResize() {
     Kokkos::deep_copy(view_2d, 222);
     Kokkos::deep_copy(h_view_2d_old, view_2d);
     resize_dispatch(Tag{}, view_2d, 2 * sizes[0], sizes[1]);
-    EXPECT_TRUE(view_2d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_2d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_2d =
         Kokkos::create_mirror_view(view_2d);
     Kokkos::deep_copy(h_view_2d, view_2d);
@@ -190,7 +190,7 @@ void impl_testResize() {
         }
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
   {
     using view_type = Kokkos::View<int***, DeviceType>;
@@ -200,7 +200,7 @@ void impl_testResize() {
     Kokkos::deep_copy(view_3d, 333);
     Kokkos::deep_copy(h_view_3d_old, view_3d);
     resize_dispatch(Tag{}, view_3d, 2 * sizes[0], sizes[1], sizes[2]);
-    EXPECT_TRUE(view_3d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_3d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_3d =
         Kokkos::create_mirror_view(view_3d);
     Kokkos::deep_copy(h_view_3d, view_3d);
@@ -215,7 +215,7 @@ void impl_testResize() {
         }
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
   {
     using view_type = Kokkos::View<int****, DeviceType>;
@@ -225,7 +225,7 @@ void impl_testResize() {
     Kokkos::deep_copy(view_4d, 444);
     Kokkos::deep_copy(h_view_4d_old, view_4d);
     resize_dispatch(Tag{}, view_4d, 2 * sizes[0], sizes[1], sizes[2], sizes[3]);
-    EXPECT_TRUE(view_4d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_4d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_4d =
         Kokkos::create_mirror_view(view_4d);
     Kokkos::deep_copy(h_view_4d, view_4d);
@@ -242,7 +242,7 @@ void impl_testResize() {
         }
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
   {
     using view_type = Kokkos::View<int*****, DeviceType>;
@@ -254,7 +254,7 @@ void impl_testResize() {
     Kokkos::deep_copy(h_view_5d_old, view_5d);
     resize_dispatch(Tag{}, view_5d, 2 * sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4]);
-    EXPECT_TRUE(view_5d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_5d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_5d =
         Kokkos::create_mirror_view(view_5d);
     Kokkos::deep_copy(h_view_5d, view_5d);
@@ -274,7 +274,7 @@ void impl_testResize() {
         }
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
   {
     using view_type = Kokkos::View<int******, DeviceType>;
@@ -286,7 +286,7 @@ void impl_testResize() {
     Kokkos::deep_copy(h_view_6d_old, view_6d);
     resize_dispatch(Tag{}, view_6d, 2 * sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4], sizes[5]);
-    EXPECT_TRUE(view_6d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_6d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_6d =
         Kokkos::create_mirror_view(view_6d);
     Kokkos::deep_copy(h_view_6d, view_6d);
@@ -308,7 +308,7 @@ void impl_testResize() {
         }
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
   {
     using view_type = Kokkos::View<int*******, DeviceType>;
@@ -320,7 +320,7 @@ void impl_testResize() {
     Kokkos::deep_copy(h_view_7d_old, view_7d);
     resize_dispatch(Tag{}, view_7d, 2 * sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4], sizes[5], sizes[6]);
-    EXPECT_TRUE(view_7d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_7d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_7d =
         Kokkos::create_mirror_view(view_7d);
     Kokkos::deep_copy(h_view_7d, view_7d);
@@ -344,7 +344,7 @@ void impl_testResize() {
         }
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
   {
     using view_type = Kokkos::View<int********, DeviceType>;
@@ -356,7 +356,7 @@ void impl_testResize() {
     Kokkos::deep_copy(h_view_8d_old, view_8d);
     resize_dispatch(Tag{}, view_8d, 2 * sizes[0], sizes[1], sizes[2], sizes[3],
                     sizes[4], sizes[5], sizes[6], sizes[7]);
-    EXPECT_TRUE(view_8d.extent(0) == 2 * sizes[0]);
+    EXPECT_EQ(view_8d.extent(0), 2 * sizes[0]);
     typename view_type::HostMirror h_view_8d =
         Kokkos::create_mirror_view(view_8d);
     Kokkos::deep_copy(h_view_8d, view_8d);
@@ -382,7 +382,7 @@ void impl_testResize() {
         }
       }
     }
-    EXPECT_TRUE(test == true);
+    EXPECT_TRUE(test);
   }
 }
 
diff --git a/packages/kokkos/core/unit_test/TestScan.hpp b/packages/kokkos/core/unit_test/TestScan.hpp
index 67cb85553..1a4056af0 100644
--- a/packages/kokkos/core/unit_test/TestScan.hpp
+++ b/packages/kokkos/core/unit_test/TestScan.hpp
@@ -88,8 +88,7 @@ struct TestScan {
   void init(value_type& update) const { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& update,
-            volatile const value_type& input) const {
+  void join(value_type& update, const value_type& input) const {
     update += input;
   }
 
diff --git a/packages/kokkos/core/unit_test/TestSharedAlloc.hpp b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
index 46534eeb1..f66b35dc9 100644
--- a/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
+++ b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -44,7 +44,6 @@
 
 #include <gtest/gtest.h>
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestStringManipulation.cpp b/packages/kokkos/core/unit_test/TestStringManipulation.cpp
new file mode 100644
index 000000000..92b2afa47
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestStringManipulation.cpp
@@ -0,0 +1,217 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_StringManipulation.hpp>
+#include <climits>
+
+namespace {
+
+#define STATIC_ASSERT(cond) static_assert(cond, "")
+
+KOKKOS_FUNCTION constexpr bool test_strlen() {
+  using Kokkos::Impl::strlen;
+  constexpr char str[] = "How many characters does this string contain?";
+  STATIC_ASSERT(strlen(str) == 45);  // without null character
+  STATIC_ASSERT(sizeof str == 46);   // with null character
+  STATIC_ASSERT(strlen("") == 0);
+  return true;
+}
+STATIC_ASSERT(test_strlen());
+
+KOKKOS_FUNCTION constexpr bool test_strcmp() {
+  using Kokkos::Impl::strcmp;
+  constexpr char cat1[] = "Heathcliff";
+  constexpr char cat2[] = "Snagglepuss";
+  constexpr char cat3[] = "Hobbes";
+  constexpr char cat4[] = "Garfield";
+  STATIC_ASSERT(strcmp(cat1, cat1) == 0);
+#if (!defined(KOKKOS_COMPILER_NVCC) ||                                 \
+     ((__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 3))) && \
+    (!defined(__INTEL_COMPILER_BUILD_DATE) ||                          \
+     (__INTEL_COMPILER_BUILD_DATE >= 20210228))
+  STATIC_ASSERT(strcmp(cat1, cat2) < 0);
+  STATIC_ASSERT(strcmp(cat1, cat3) < 0);
+#endif
+  STATIC_ASSERT(strcmp(cat1, cat4) > 0);
+  STATIC_ASSERT(strcmp(cat2, cat2) == 0);
+  STATIC_ASSERT(strcmp(cat2, cat3) > 0);
+  STATIC_ASSERT(strcmp(cat2, cat4) > 0);
+  STATIC_ASSERT(strcmp(cat3, cat3) == 0);
+  STATIC_ASSERT(strcmp(cat3, cat4) > 0);
+  STATIC_ASSERT(strcmp(cat4, cat4) == 0);
+  return true;
+}
+STATIC_ASSERT(test_strcmp());
+
+KOKKOS_FUNCTION constexpr bool test_strncmp() {
+  using Kokkos::Impl::strncmp;
+  constexpr char greet1[] = "Hello, world!";
+  constexpr char greet2[] = "Hello, everybody!";
+  constexpr char greet3[] = "Hello, somebody!";
+  STATIC_ASSERT(strncmp(greet1, greet2, 13) > 0);
+  STATIC_ASSERT(strncmp(greet2, greet1, 13) < 0);
+  STATIC_ASSERT(strncmp(greet2, greet1, 7) == 0);
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 610)
+  (void)greet3;
+#elif defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 710)
+  STATIC_ASSERT(strncmp(&greet2[12], &greet3[11], 5) == 0);
+#else
+  STATIC_ASSERT(strncmp(greet2 + 12, greet3 + 11, 5) == 0);
+#endif
+  STATIC_ASSERT(strncmp(greet1, greet2, 0) == 0);
+  return true;
+}
+STATIC_ASSERT(test_strncmp());
+
+KOKKOS_FUNCTION constexpr bool strcpy_helper(const char* dest, const char* src,
+                                             const char* ref) {
+  using Kokkos::Impl::strcmp;
+  using Kokkos::Impl::strcpy;
+  char buffer[50] = {};
+  strcpy(buffer, dest);
+  strcpy(buffer, src);
+  return strcmp(buffer, ref) == 0;
+}
+
+KOKKOS_FUNCTION constexpr bool test_strcpy() {
+  STATIC_ASSERT(strcpy_helper("abcdef", "hi", "hi\0\0\0f"));
+  return true;
+}
+STATIC_ASSERT(test_strcpy());
+
+KOKKOS_FUNCTION constexpr bool strncpy_helper(const char* dest, const char* src,
+                                              std::size_t count,
+                                              const char* ref) {
+  using Kokkos::Impl::strcmp;
+  using Kokkos::Impl::strlen;
+  using Kokkos::Impl::strncpy;
+  char buffer[50] = {};
+  strncpy(buffer, dest, strlen(dest));
+  strncpy(buffer, src, count);
+  return strcmp(buffer, ref) == 0;
+}
+
+KOKKOS_FUNCTION constexpr bool test_strncpy() {
+  STATIC_ASSERT(strncpy_helper("abcdef", "hi", 5, "hi\0\0\0f"));
+  STATIC_ASSERT(strncpy_helper("abcdef", "hi", 0, "abcdef"));
+  return true;
+}
+STATIC_ASSERT(test_strncpy());
+
+KOKKOS_FUNCTION constexpr bool strcat_helper(const char* dest, const char* src,
+                                             const char* ref) {
+  using Kokkos::Impl::strcat;
+  using Kokkos::Impl::strcmp;
+  char buffer[50] = {};
+  strcat(buffer, dest);
+  strcat(buffer, src);
+  return strcmp(buffer, ref) == 0;
+}
+
+KOKKOS_FUNCTION constexpr bool test_strcat() {
+  STATIC_ASSERT(strcat_helper("Hello ", "World!", "Hello World!"));
+  STATIC_ASSERT(strcat_helper("Hello World!", " Goodbye World!",
+                              "Hello World! Goodbye World!"));
+  return true;
+}
+STATIC_ASSERT(test_strcat());
+
+KOKKOS_FUNCTION constexpr bool strncat_helper(const char* dest, const char* src,
+                                              std::size_t count,
+                                              const char* ref) {
+  using Kokkos::Impl::strcmp;
+  using Kokkos::Impl::strlen;
+  using Kokkos::Impl::strncat;
+  char buffer[50] = {};
+  strncat(buffer, dest, strlen(dest));
+  strncat(buffer, src, count);
+  return strcmp(buffer, ref) == 0;
+}
+
+KOKKOS_FUNCTION constexpr bool test_strncat() {
+  STATIC_ASSERT(
+      strncat_helper("Hello World!", " Goodbye World!", 3, "Hello World! Go"));
+  STATIC_ASSERT(
+      strncat_helper("Hello World!", " Goodbye World!", 0, "Hello World!"));
+  return true;
+}
+STATIC_ASSERT(test_strncat());
+
+#if !defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 540)
+template <class Integral>
+KOKKOS_FUNCTION constexpr bool to_chars_helper(Integral val, char const* ref) {
+  using Kokkos::Impl::strcmp;
+  using Kokkos::Impl::strlen;
+  using Kokkos::Impl::to_chars_i;
+  constexpr int BUFFER_SIZE = 21;
+  char buffer[BUFFER_SIZE]  = {};
+  return (buffer + strlen(ref) ==
+          to_chars_i(buffer, buffer + BUFFER_SIZE, val).ptr) &&
+         (strcmp(buffer, ref) == 0);
+}
+
+KOKKOS_FUNCTION constexpr bool test_to_chars() {
+  STATIC_ASSERT(to_chars_helper(0, "0"));
+  STATIC_ASSERT(to_chars_helper(123, "123"));
+  STATIC_ASSERT(to_chars_helper(-456, "-456"));
+  STATIC_ASSERT(to_chars_helper(INT_MAX, "2147483647"));
+  STATIC_ASSERT(to_chars_helper(INT_MIN, "-2147483648"));
+
+  STATIC_ASSERT(to_chars_helper(0u, "0"));
+  STATIC_ASSERT(to_chars_helper(78u, "78"));
+  STATIC_ASSERT(to_chars_helper(UINT_MAX, "4294967295"));
+
+  STATIC_ASSERT(to_chars_helper(0ll, "0"));
+  STATIC_ASSERT(to_chars_helper(LLONG_MAX, "9223372036854775807"));
+  STATIC_ASSERT(to_chars_helper(LLONG_MIN, "-9223372036854775808"));
+
+  STATIC_ASSERT(to_chars_helper(0ull, "0"));
+  STATIC_ASSERT(to_chars_helper(ULLONG_MAX, "18446744073709551615"));
+
+  return true;
+}
+STATIC_ASSERT(test_to_chars());
+#endif
+
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
index 6b9cd2c90..62fd68b6c 100644
--- a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
+++ b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
@@ -48,7 +48,6 @@
 #include <Kokkos_Macros.hpp>
 #if defined(KOKKOS_ENABLE_TASKDAG)
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_FixedBufferMemoryPool.hpp>
 #include <cstdio>
 #include <iostream>
 #include <cmath>
diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp
index cade6b024..f1d0f9cb3 100644
--- a/packages/kokkos/core/unit_test/TestTeam.hpp
+++ b/packages/kokkos/core/unit_test/TestTeam.hpp
@@ -43,7 +43,6 @@
 */
 
 #include <cstdio>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
@@ -62,17 +61,20 @@ struct TestTeamPolicy {
   view_type m_flags;
 
   TestTeamPolicy(const size_t league_size)
-      : m_flags(
-            Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
+      : m_flags(Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
   // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-            Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 32).team_size_max(
-                *this, Kokkos::ParallelReduceTag()),
+                Kokkos::TeamPolicy<ScheduleType, ExecSpace>(
+                    1, std::is_same<ExecSpace,
+                                    Kokkos::Experimental::OpenMPTarget>::value
+                           ? 32
+                           : 1)
+                    .team_size_max(*this, Kokkos::ParallelReduceTag()),
 #else
-            Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
-                *this, Kokkos::ParallelReduceTag()),
+                Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
+                    *this, Kokkos::ParallelReduceTag()),
 #endif
-            league_size) {
+                league_size) {
   }
 
   struct VerifyInitTag {};
@@ -131,8 +133,12 @@ struct TestTeamPolicy {
     constexpr const int smallest_work = 1;
     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-    Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(smallest_work, 32,
-                                                     smallest_work);
+    Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
+        smallest_work,
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+            ? 32
+            : smallest_work,
+        smallest_work);
 #else
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
         smallest_work, smallest_work, smallest_work);
@@ -143,8 +149,12 @@ struct TestTeamPolicy {
     (void)both_auto;
     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-    Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32,
-                                                       Kokkos::AUTO());
+    Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
+        smallest_work,
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+            ? 32
+            : smallest_work,
+        Kokkos::AUTO());
 #else
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
         smallest_work, smallest_work, Kokkos::AUTO());
@@ -166,10 +176,18 @@ struct TestTeamPolicy {
       // 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
       const int team_size =
-          policy_type(league_size, 32)
+          policy_type(
+              league_size,
+              std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+                  ? 32
+                  : 1)
               .team_size_max(functor, Kokkos::ParallelForTag());
       const int team_size_init =
-          policy_type_init(league_size, 32)
+          policy_type_init(
+              league_size,
+              std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+                  ? 32
+                  : 1)
               .team_size_max(functor, Kokkos::ParallelForTag());
 #else
       const int team_size =
@@ -215,7 +233,11 @@ struct TestTeamPolicy {
     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
     const int team_size =
-        policy_type_reduce(league_size, 32)
+        policy_type_reduce(
+            league_size,
+            std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+                ? 32
+                : 1)
             .team_size_max(functor, Kokkos::ParallelReduceTag());
 #else
     const int team_size =
@@ -272,7 +294,7 @@ class ReduceTeamFunctor {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type &dst, const volatile value_type &src) const {
+  void join(value_type &dst, const value_type &src) const {
     dst.value[0] += src.value[0];
     dst.value[1] += src.value[1];
     dst.value[2] += src.value[2];
@@ -371,8 +393,7 @@ class ScanTeamFunctor {
   void init(value_type &error) const { error = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join(value_type volatile &error,
-            value_type volatile const &input) const {
+  void join(value_type &error, value_type const &input) const {
     if (input) error = 1;
   }
 
@@ -380,8 +401,7 @@ class ScanTeamFunctor {
     using value_type = int64_t;
 
     KOKKOS_INLINE_FUNCTION
-    void join(value_type volatile &dst,
-              value_type volatile const &input) const {
+    void join(value_type &dst, value_type const &input) const {
       if (dst < input) dst = input;
     }
   };
@@ -571,11 +591,17 @@ struct TestSharedTeam {
 
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
     const size_t team_size =
-        Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max(
-            Functor(), Kokkos::ParallelReduceTag());
-
-    Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(32 / team_size,
-                                                          team_size);
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+            ? Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max(
+                  Functor(), Kokkos::ParallelReduceTag())
+            : Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1)
+                  .team_size_max(Functor(), Kokkos::ParallelReduceTag());
+
+    Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+            ? 32 / team_size
+            : 8192 / team_size,
+        team_size);
 #else
     const size_t team_size =
         Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1).team_size_max(
@@ -616,7 +642,9 @@ struct TestLambdaSharedTeam {
 
     const int SHARED_COUNT = 1000;
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-    int team_size = 32;
+    int team_size =
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
+                                                                           : 1;
 #else
     int team_size = 1;
 #endif
@@ -776,11 +804,18 @@ struct TestScratchTeam {
         Functor::SHARED_THREAD_COUNT);
 
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-    p_type team_exec = p_type(64, 32).set_scratch_size(
-        1,
-        Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
-            Functor::SHARED_TEAM_COUNT)),
-        Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
+    p_type team_exec =
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+            ? p_type(64, 32).set_scratch_size(
+                  1,
+                  Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
+                      Functor::SHARED_TEAM_COUNT)),
+                  Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)))
+            : p_type(8192, 1).set_scratch_size(
+                  1,
+                  Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
+                      Functor::SHARED_TEAM_COUNT)),
+                  Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
 #else
     p_type team_exec = p_type(8192, 1).set_scratch_size(
         1,
@@ -797,7 +832,10 @@ struct TestScratchTeam {
         Functor::shared_int_array_type::shmem_size(3 * team_size);
 
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-    team_exec = p_type(64 / team_size, team_size);
+    team_exec =
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
+            ? p_type(64 / team_size, team_size)
+            : p_type(8192 / team_size, team_size);
 #else
     team_exec          = p_type(8192 / team_size, team_size);
 #endif
@@ -825,31 +863,31 @@ namespace Test {
 template <class ExecSpace>
 KOKKOS_INLINE_FUNCTION int test_team_mulit_level_scratch_loop_body(
     const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       a_team1(team.team_scratch(0), 128);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       a_thread1(team.thread_scratch(0), 16);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       a_team2(team.team_scratch(0), 128);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       a_thread2(team.thread_scratch(0), 16);
 
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       b_team1(team.team_scratch(1), 12800);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       b_thread1(team.thread_scratch(1), 1600);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       b_team2(team.team_scratch(1), 12800);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       b_thread2(team.thread_scratch(1), 1600);
 
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       a_team3(team.team_scratch(0), 128);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       a_thread3(team.thread_scratch(0), 16);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       b_team3(team.team_scratch(1), 12800);
-  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+  Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       b_thread3(team.thread_scratch(1), 1600);
 
   // The explicit types for 0 and 128 are here to test TeamThreadRange accepting
@@ -945,7 +983,7 @@ struct ClassNoShmemSizeFunction {
   using member_type =
       typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
 
-  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const TagFor &, const member_type &team) const {
@@ -967,20 +1005,20 @@ struct ClassNoShmemSizeFunction {
     const int per_team0 =
         3 *
         Kokkos::View<double *, ExecSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
     const int per_thread0 =
         3 *
         Kokkos::View<double *, ExecSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
 
     const int per_team1 =
         3 * Kokkos::View<
                 double *, ExecSpace,
-                Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
+                Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
     const int per_thread1 =
-        3 * Kokkos::View<
-                double *, ExecSpace,
-                Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
+        3 *
+        Kokkos::View<double *, ExecSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);
 
 #ifdef KOKKOS_ENABLE_SYCL
     int team_size = 4;
@@ -1031,7 +1069,7 @@ struct ClassWithShmemSizeFunction {
   using member_type =
       typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
 
-  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const TagFor &, const member_type &team) const {
@@ -1053,11 +1091,11 @@ struct ClassWithShmemSizeFunction {
     const int per_team1 =
         3 * Kokkos::View<
                 double *, ExecSpace,
-                Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
+                Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
     const int per_thread1 =
-        3 * Kokkos::View<
-                double *, ExecSpace,
-                Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
+        3 *
+        Kokkos::View<double *, ExecSpace,
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);
 
     int team_size = 8;
     if (team_size > ExecSpace::concurrency())
@@ -1097,11 +1135,11 @@ struct ClassWithShmemSizeFunction {
     const int per_team0 =
         3 *
         Kokkos::View<double *, ExecSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
     const int per_thread0 =
         3 *
         Kokkos::View<double *, ExecSpace,
-                     Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);
+                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
     return per_team0 + team_size * per_thread0;
   }
 };
@@ -1109,27 +1147,26 @@ struct ClassWithShmemSizeFunction {
 template <class ExecSpace, class ScheduleType>
 void test_team_mulit_level_scratch_test_lambda() {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;
   Kokkos::View<int, ExecSpace> d_errors("Errors");
   errors = d_errors;
 
   const int per_team0 =
       3 *
       Kokkos::View<double *, ExecSpace,
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
   const int per_thread0 =
-      3 *
-      Kokkos::View<double *, ExecSpace,
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);
+      3 * Kokkos::View<double *, ExecSpace,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
 
   const int per_team1 =
       3 *
       Kokkos::View<double *, ExecSpace,
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
   const int per_thread1 =
       3 *
       Kokkos::View<double *, ExecSpace,
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);
 
 #ifdef KOKKOS_ENABLE_SYCL
   int team_size = 4;
@@ -1244,9 +1281,8 @@ template <class ExecSpace, class ScheduleType, class T, class Enabled = void>
 struct TestTeamBroadcast;
 
 template <class ExecSpace, class ScheduleType, class T>
-struct TestTeamBroadcast<
-    ExecSpace, ScheduleType, T,
-    typename std::enable_if<(sizeof(T) == sizeof(char)), void>::type> {
+struct TestTeamBroadcast<ExecSpace, ScheduleType, T,
+                         std::enable_if_t<(sizeof(T) == sizeof(char)), void>> {
   using team_member =
       typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
   using memory_space = typename ExecSpace::memory_space;
@@ -1358,9 +1394,8 @@ struct TestTeamBroadcast<
 };
 
 template <class ExecSpace, class ScheduleType, class T>
-struct TestTeamBroadcast<
-    ExecSpace, ScheduleType, T,
-    typename std::enable_if<(sizeof(T) > sizeof(char)), void>::type> {
+struct TestTeamBroadcast<ExecSpace, ScheduleType, T,
+                         std::enable_if_t<(sizeof(T) > sizeof(char)), void>> {
   using team_member =
       typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
   using value_type = T;
@@ -1434,9 +1469,8 @@ struct TestTeamBroadcast<
   }
 
   template <class ScalarType>
-  static inline
-      typename std::enable_if<!std::is_integral<ScalarType>::value, void>::type
-      compare_test(ScalarType A, ScalarType B, double epsilon_factor) {
+  static inline std::enable_if_t<!std::is_integral<ScalarType>::value, void>
+  compare_test(ScalarType A, ScalarType B, double epsilon_factor) {
     if (std::is_same<ScalarType, double>::value ||
         std::is_same<ScalarType, float>::value) {
       ASSERT_NEAR((double)A, (double)B,
@@ -1448,9 +1482,8 @@ struct TestTeamBroadcast<
   }
 
   template <class ScalarType>
-  static inline
-      typename std::enable_if<std::is_integral<ScalarType>::value, void>::type
-      compare_test(ScalarType A, ScalarType B, double) {
+  static inline std::enable_if_t<std::is_integral<ScalarType>::value, void>
+  compare_test(ScalarType A, ScalarType B, double) {
     ASSERT_EQ(A, B);
   }
 
@@ -1528,7 +1561,9 @@ struct TestScratchAlignment {
   void test(bool allocate_small) {
     int shmem_size = ScratchView::shmem_size(11);
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-    int team_size = 32;
+    int team_size =
+        std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
+                                                                           : 1;
 #else
     int team_size      = 1;
 #endif
diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
index 1582783a7..be1848d70 100644
--- a/packages/kokkos/core/unit_test/TestTeamBasic.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
@@ -114,6 +114,54 @@ TEST(TEST_CATEGORY, team_reduce_large) {
   }
 }
 
+template <typename ExecutionSpace>
+struct LargeTeamScratchFunctor {
+  using team_member = typename Kokkos::TeamPolicy<ExecutionSpace>::member_type;
+  const size_t m_per_team_bytes;
+
+  KOKKOS_FUNCTION void operator()(const team_member& member) const {
+    double* team_shared = static_cast<double*>(
+        member.team_scratch(/*level*/ 1).get_shmem(m_per_team_bytes));
+    if (team_shared == nullptr)
+      Kokkos::abort("Couldn't allocate required size!\n");
+    double* team_shared_1 = static_cast<double*>(
+        member.team_scratch(/*level*/ 1).get_shmem(sizeof(double)));
+    if (team_shared_1 != nullptr)
+      Kokkos::abort("Allocated more memory than requested!\n");
+  }
+};
+
+TEST(TEST_CATEGORY, large_team_scratch_size) {
+  const int level   = 1;
+  const int n_teams = 1;
+
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  // Allocate slightly more than (2^31-1) bytes. The other value resulted in
+  // problems allocating too much memory.
+  const size_t per_team_extent = 268435460;
+#else
+  // Value originally chosen in the reproducer.
+  const size_t per_team_extent = 502795560;
+#endif
+
+  const size_t per_team_bytes = per_team_extent * sizeof(double);
+
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  Kokkos::TeamPolicy<TEST_EXECSPACE> policy(
+      n_teams,
+      std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value
+          ? 32
+          : 1);
+#else
+  Kokkos::TeamPolicy<TEST_EXECSPACE> policy(n_teams, 1);
+#endif
+  policy.set_scratch_size(level, Kokkos::PerTeam(per_team_bytes));
+
+  Kokkos::parallel_for(policy,
+                       LargeTeamScratchFunctor<TEST_EXECSPACE>{per_team_bytes});
+  Kokkos::fence();
+}
+
 TEST(TEST_CATEGORY, team_broadcast_long) {
   TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
                     long>::test_teambroadcast(0, 1);
@@ -154,12 +202,6 @@ struct long_wrapper {
     lhs.value += rhs.value;
   }
 
-  KOKKOS_FUNCTION
-  friend void operator+=(volatile long_wrapper& lhs,
-                         const volatile long_wrapper& rhs) {
-    lhs.value += rhs.value;
-  }
-
   KOKKOS_FUNCTION
   void operator=(const long_wrapper& other) { value = other.value; }
 
diff --git a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
index 836134afe..469bba23b 100644
--- a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
@@ -58,8 +58,7 @@ TEST(TEST_CATEGORY, team_reduction_scan) {
 }
 
 TEST(TEST_CATEGORY, team_long_reduce) {
-#ifdef KOKKOS_ENABLE_OPENMPTARGET
-  // WORKAROUND OPENMPTARGET: Not implemented
+#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET: Not implemented
   if constexpr (!std::is_same<TEST_EXECSPACE,
                               Kokkos::Experimental::OpenMPTarget>::value)
 #endif
@@ -76,8 +75,7 @@ TEST(TEST_CATEGORY, team_long_reduce) {
 }
 
 TEST(TEST_CATEGORY, team_double_reduce) {
-#ifdef KOKKOS_ENABLE_OPENMPTARGET
-  // WORKAROUND OPENMPTARGET: Not implemented
+#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET: Not implemented
   if constexpr (!std::is_same<TEST_EXECSPACE,
                               Kokkos::Experimental::OpenMPTarget>::value)
 #endif
@@ -97,5 +95,44 @@ TEST(TEST_CATEGORY, team_double_reduce) {
   }
 }
 
+template <typename ExecutionSpace>
+struct DummyTeamReductionFunctor {
+  using TeamPolicy     = Kokkos::TeamPolicy<ExecutionSpace>;
+  using TeamHandleType = typename TeamPolicy::member_type;
+
+  KOKKOS_FUNCTION void operator()(const TeamHandleType&, double&) const {}
+};
+
+template <typename ExecutionSpace>
+void test_team_parallel_reduce(const int num_loop_size) {
+  using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
+
+  using ReducerType = Kokkos::Sum<double>;
+  double result     = 10.;
+  ReducerType reducer(result);
+
+  const int bytes_per_team   = 0;
+  const int bytes_per_thread = 117;
+
+  TeamPolicy team_exec(num_loop_size, Kokkos::AUTO);
+  team_exec.set_scratch_size(1, Kokkos::PerTeam(bytes_per_team),
+                             Kokkos::PerThread(bytes_per_thread));
+
+  Kokkos::parallel_reduce(team_exec,
+                          DummyTeamReductionFunctor<ExecutionSpace>{}, reducer);
+  ASSERT_EQ(result, 0.);
+}
+
+TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
+#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET: Not implemented
+  if constexpr (!std::is_same<TEST_EXECSPACE,
+                              Kokkos::Experimental::OpenMPTarget>::value)
+#endif
+  {
+    test_team_parallel_reduce<TEST_EXECSPACE>(0);
+    test_team_parallel_reduce<TEST_EXECSPACE>(1);
+  }
+}
+
 }  // namespace Test
 #endif
diff --git a/packages/kokkos/core/unit_test/TestTeamScan.hpp b/packages/kokkos/core/unit_test/TestTeamScan.hpp
index 9edba57a0..b93285b21 100644
--- a/packages/kokkos/core/unit_test/TestTeamScan.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamScan.hpp
@@ -92,18 +92,15 @@ struct TestTeamScan {
     N   = _N;
     a_d = view_type("a_d", M, N);
     a_r = view_type("a_r", M, N);
-    // Set team size explicitly to
-    // a) check whether this works in CPU backends with team_size > 1 and
-    // b) make sure we have a power of 2 and for GPU backends due to limitation
-    // of the scan algorithm implemented in CUDA etc.
-    int team_size = 1;
-    if (ExecutionSpace().concurrency() > 2) {
-      if (ExecutionSpace().concurrency() > 10000)
-        team_size = 128;
-      else
-        team_size = 3;
-    }
-    Kokkos::parallel_for(policy_type(M, team_size), *this);
+
+    // Set team size explicitly to check whether non-power-of-two team sizes can
+    // be used.
+    if (ExecutionSpace().concurrency() > 10000)
+      Kokkos::parallel_for(policy_type(M, 127), *this);
+    else if (ExecutionSpace().concurrency() > 2)
+      Kokkos::parallel_for(policy_type(M, 3), *this);
+    else
+      Kokkos::parallel_for(policy_type(M, 1), *this);
 
     auto a_i = Kokkos::create_mirror_view(a_d);
     auto a_o = Kokkos::create_mirror_view(a_r);
diff --git a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
index c0e698d92..5e637616c 100644
--- a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
@@ -43,7 +43,6 @@
 */
 
 #include <cstdio>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
@@ -64,14 +63,6 @@ class MyArray {
   void operator=(const MyArray& src) {
     for (int i = 0; i < N; i++) values[i] = src.values[i];
   }
-  KOKKOS_INLINE_FUNCTION
-  void operator+=(const volatile MyArray& src) volatile {
-    for (int i = 0; i < N; i++) values[i] += src.values[i];
-  }
-  KOKKOS_INLINE_FUNCTION
-  void operator=(const volatile MyArray& src) volatile {
-    for (int i = 0; i < N; i++) values[i] = src.values[i];
-  }
 };
 
 template <class T, int N, class PolicyType, int S>
diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp
index dbed67475..8c302f930 100644
--- a/packages/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp
@@ -1031,8 +1031,8 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) {
   using T = double;
   using namespace VectorScanReducer;
 
-  static constexpr int n              = 1000000;
-  static constexpr int n_vector_range = 100;
+  constexpr int n              = 1000000;
+  constexpr int n_vector_range = 100;
 
   checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range,
             Kokkos::Prod<T, TEST_EXECSPACE>>()
@@ -1054,6 +1054,9 @@ TEST(TEST_CATEGORY, parallel_scan_with_reducers) {
   checkScan<TEST_EXECSPACE, ScanType::Inclusive, n, n_vector_range,
             Kokkos::Min<T, TEST_EXECSPACE>>()
       .run();
+
+  (void)n;
+  (void)n_vector_range;
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
index c4116b913..44ffc7f3b 100644
--- a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
@@ -78,37 +78,6 @@ struct my_complex {
     return *this;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  my_complex& operator=(const volatile my_complex& src) {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  volatile my_complex& operator=(const my_complex& src) volatile {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  volatile my_complex& operator=(const volatile my_complex& src) volatile {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-    return *this;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  my_complex(const volatile my_complex& src) {
-    re    = src.re;
-    im    = src.im;
-    dummy = src.dummy;
-  }
-
   KOKKOS_INLINE_FUNCTION
   my_complex(const double& val) {
     re    = val;
@@ -124,13 +93,6 @@ struct my_complex {
     return *this;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void operator+=(const volatile my_complex& src) volatile {
-    re += src.re;
-    im += src.im;
-    dummy += src.dummy;
-  }
-
   KOKKOS_INLINE_FUNCTION
   my_complex operator+(const my_complex& src) {
     my_complex tmp = *this;
@@ -140,15 +102,6 @@ struct my_complex {
     return tmp;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  my_complex operator+(const volatile my_complex& src) volatile {
-    my_complex tmp = *this;
-    tmp.re += src.re;
-    tmp.im += src.im;
-    tmp.dummy += src.dummy;
-    return tmp;
-  }
-
   KOKKOS_INLINE_FUNCTION
   my_complex& operator*=(const my_complex& src) {
     double re_tmp = re * src.re - im * src.im;
@@ -159,15 +112,6 @@ struct my_complex {
     return *this;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  void operator*=(const volatile my_complex& src) volatile {
-    double re_tmp = re * src.re - im * src.im;
-    double im_tmp = re * src.im + im * src.re;
-    re            = re_tmp;
-    im            = im_tmp;
-    dummy *= src.dummy;
-  }
-
   KOKKOS_INLINE_FUNCTION
   bool operator==(const my_complex& src) const {
     return (re == src.re) && (im == src.im) && (dummy == src.dummy);
diff --git a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
deleted file mode 100644
index a0d00ded1..000000000
--- a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core.hpp>
-
-#define KOKKOS_PRAGMA_UNROLL(a)
-
-namespace {
-
-template <class Scalar, class ExecutionSpace>
-struct SumPlain {
-  using execution_space = ExecutionSpace;
-  using type            = typename Kokkos::View<Scalar*, execution_space>;
-
-  type view;
-
-  SumPlain(type view_) : view(view_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int /*i*/, Scalar& val) { val += Scalar(); }
-};
-
-template <class Scalar, class ExecutionSpace>
-struct SumInitJoinFinalValueType {
-  using execution_space = ExecutionSpace;
-  using type            = typename Kokkos::View<Scalar*, execution_space>;
-  using value_type      = Scalar;
-
-  type view;
-
-  SumInitJoinFinalValueType(type view_) : view(view_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void init(value_type& val) const { val = value_type(); }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, volatile value_type& src) const {
-    val += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int /*i*/, value_type& val) const { val += value_type(); }
-};
-
-template <class Scalar, class ExecutionSpace>
-struct SumInitJoinFinalValueType2 {
-  using execution_space = ExecutionSpace;
-  using type            = typename Kokkos::View<Scalar*, execution_space>;
-  using value_type      = Scalar;
-
-  type view;
-
-  SumInitJoinFinalValueType2(type view_) : view(view_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void init(volatile value_type& val) const { val = value_type(); }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const volatile value_type& src) const {
-    val += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int /*i*/, value_type& val) const { val += value_type(); }
-};
-
-template <class Scalar, class ExecutionSpace>
-struct SumInitJoinFinalValueTypeArray {
-  using execution_space = ExecutionSpace;
-  using type            = typename Kokkos::View<Scalar*, execution_space>;
-  using value_type      = Scalar[];
-
-  type view;
-  int n;
-
-  SumInitJoinFinalValueTypeArray(type view_, int n_) : view(view_), n(n_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void init(value_type val) const {
-    for (int k = 0; k < n; k++) {
-      val[k] = 0;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type val, const volatile value_type src) const {
-    for (int k = 0; k < n; k++) {
-      val[k] += src[k];
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int i, value_type val) const {
-    for (int k = 0; k < n; k++) {
-      val[k] += k * i;
-    }
-  }
-};
-
-template <class Scalar, class ExecutionSpace>
-void TestTemplateMetaFunctions() {
-  static_assert(
-      Kokkos::Impl::ReduceFunctorHasInit<SumPlain<Scalar, ExecutionSpace>,
-                                         Scalar&>::value == false,
-      "");
-  static_assert(
-      Kokkos::Impl::ReduceFunctorHasInit<
-          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
-      "");
-  static_assert(
-      Kokkos::Impl::ReduceFunctorHasInit<
-          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
-      "");
-
-  static_assert(
-      Kokkos::Impl::ReduceFunctorHasInit<
-          SumInitJoinFinalValueTypeArray<Scalar, ExecutionSpace>>::value ==
-          true,
-      "");
-
-  static_assert(Kokkos::Impl::ReduceFunctorHasJoin<
-                    SumPlain<Scalar, ExecutionSpace>>::value == false,
-                "");
-  static_assert(
-      Kokkos::Impl::ReduceFunctorHasJoin<
-          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
-      "");
-  static_assert(
-      Kokkos::Impl::ReduceFunctorHasJoin<
-          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
-      "");
-}
-
-}  // namespace
-
-namespace Test {
-TEST(TEST_CATEGORY, template_meta_functions) {
-  TestTemplateMetaFunctions<int, TEST_EXECSPACE>();
-}
-}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestUniqueToken.hpp b/packages/kokkos/core/unit_test/TestUniqueToken.hpp
index 4ba48bf73..224a97fef 100644
--- a/packages/kokkos/core/unit_test/TestUniqueToken.hpp
+++ b/packages/kokkos/core/unit_test/TestUniqueToken.hpp
@@ -42,11 +42,11 @@
 //@HEADER
 */
 
-#include <iostream>
+#include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
 
-namespace Test {
+namespace {
 
 template <class Space, Kokkos::Experimental::UniqueTokenScope Scope>
 class TestUniqueToken {
@@ -152,14 +152,12 @@ class TestUniqueToken {
     }
 #endif
 
-    std::cout << "TestUniqueToken max reuse = " << max << std::endl;
-
     typename view_type::HostMirror host_errors =
         Kokkos::create_mirror_view(self.errors);
 
     Kokkos::deep_copy(host_errors, self.errors);
 
-    ASSERT_EQ(host_errors(0), 0);
+    ASSERT_EQ(host_errors(0), 0) << "max reuse was " << max;
   }
 };
 
@@ -268,22 +266,24 @@ class TestAcquireTeamUniqueToken {
       }
     }
 
-    std::cout << "TestAcquireTeamUniqueToken max reuse = " << max << std::endl;
-
     typename view_type::HostMirror host_errors =
         Kokkos::create_mirror_view(self.errors);
 
     Kokkos::deep_copy(host_errors, self.errors);
 
-    ASSERT_EQ(host_errors(0), 0);
+    ASSERT_EQ(host_errors(0), 0) << "max reuse was " << max;
   }
 };
 
-TEST(TEST_CATEGORY, acquire_team_unique_token) {
-  // FIXME_OPENMPTARGET - Not yet implemented.
-#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
-  TestAcquireTeamUniqueToken<TEST_EXECSPACE>::run();
+TEST(TEST_CATEGORY, unique_token_team_acquire) {
+#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET
+  if constexpr (std::is_same<TEST_EXECSPACE,
+                             Kokkos::Experimental::OpenMPTarget>::value) {
+    GTEST_SKIP() << "skipping because OpenMPTarget does not implement yet a "
+                    "specialization of AcquireTeamUniqueToken";
+  } else
 #endif
+    TestAcquireTeamUniqueToken<TEST_EXECSPACE>::run();
 }
 
-}  // namespace Test
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/TestUtilities.hpp b/packages/kokkos/core/unit_test/TestUtilities.hpp
index fc8e615da..cbe9b77af 100644
--- a/packages/kokkos/core/unit_test/TestUtilities.hpp
+++ b/packages/kokkos/core/unit_test/TestUtilities.hpp
@@ -44,7 +44,6 @@
 
 #include <gtest/gtest.h>
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp
index 83efae617..320eb6f2e 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp
@@ -45,7 +45,6 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
@@ -57,8 +56,14 @@ size_t allocation_count(const Kokkos::View<T, P...> &view) {
   const size_t alloc = view.span();
 
   const int memory_span = Kokkos::View<int *>::required_allocation_size(100);
-
-  return (card <= alloc && memory_span == 400) ? alloc : 0;
+  const int memory_span_layout =
+      Kokkos::View<int *, Kokkos::LayoutRight>::required_allocation_size(
+          Kokkos::LayoutRight(100));
+
+  return ((card <= alloc) && (memory_span == 400) &&
+          (memory_span_layout == 400))
+             ? alloc
+             : 0;
 }
 
 /*--------------------------------------------------------------------------*/
@@ -104,8 +109,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 8> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -200,8 +204,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -278,8 +281,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -354,8 +356,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -442,8 +443,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -512,8 +512,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -605,8 +604,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -681,8 +679,7 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
   using value_type = int;
 
   KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type &update,
-                   const volatile value_type &input) {
+  static void join(value_type &update, const value_type &input) {
     update |= input;
   }
 
@@ -1023,6 +1020,91 @@ class TestViewAPI {
 #endif
   }
 
+  static void run_test_contruction_from_layout() {
+    using hView0 = typename dView0::HostMirror;
+    using hView1 = typename dView1::HostMirror;
+    using hView2 = typename dView2::HostMirror;
+    using hView3 = typename dView3::HostMirror;
+    using hView4 = typename dView4::HostMirror;
+
+    hView0 hv_0("dView0::HostMirror");
+    hView1 hv_1("dView1::HostMirror", N0);
+    hView2 hv_2("dView2::HostMirror", N0);
+    hView3 hv_3("dView3::HostMirror", N0);
+    hView4 hv_4("dView4::HostMirror", N0);
+
+    dView0 dv_0_1(nullptr, 0);
+    dView0 dv_0_2(hv_0.label(), hv_0.layout());
+
+    dView1 dv_1_1(nullptr, 0);
+    dView1 dv_1_2(hv_1.label(), hv_1.layout());
+
+    dView2 dv_2_1(nullptr, 0);
+    dView2 dv_2_2(hv_2.label(), hv_2.layout());
+
+    dView3 dv_3_1(nullptr, 0);
+    dView3 dv_3_2(hv_3.label(), hv_3.layout());
+
+    dView4 dv_4_1(nullptr, 0);
+    dView4 dv_4_2(hv_4.label(), hv_4.layout());
+  }
+
+  static void run_test_contruction_from_layout_2() {
+    using dView3_0 = Kokkos::View<T ***, device>;
+    using dView3_1 = Kokkos::View<T * * [N1], device>;
+    using dView3_2 = Kokkos::View<T * [N1][N2], device>;
+    using dView3_3 = Kokkos::View<T[N0][N1][N2], device>;
+
+    dView3_0 v_0("v_0", N0, N1, N2);
+    dView3_1 v_1("v_1", N0, N1);
+    dView3_2 v_2("v_2", N0);
+    dView3_3 v_3("v_2");
+
+    dView3_1 v_1_a("v_1", N0, N1, N2);
+    dView3_2 v_2_a("v_2", N0, N1, N2);
+    dView3_3 v_3_a("v_2", N0, N1, N2);
+
+    {
+      dView3_0 dv_1(v_0.label(), v_0.layout());
+      dView3_0 dv_2(v_1.label(), v_1.layout());
+      dView3_0 dv_3(v_2.label(), v_2.layout());
+      dView3_0 dv_4(v_3.label(), v_3.layout());
+      dView3_0 dv_5(v_1_a.label(), v_1_a.layout());
+      dView3_0 dv_6(v_2_a.label(), v_2_a.layout());
+      dView3_0 dv_7(v_3_a.label(), v_3_a.layout());
+    }
+
+    {
+      dView3_1 dv_1(v_0.label(), v_0.layout());
+      dView3_1 dv_2(v_1.label(), v_1.layout());
+      dView3_1 dv_3(v_2.label(), v_2.layout());
+      dView3_1 dv_4(v_3.label(), v_3.layout());
+      dView3_1 dv_5(v_1_a.label(), v_1_a.layout());
+      dView3_1 dv_6(v_2_a.label(), v_2_a.layout());
+      dView3_1 dv_7(v_3_a.label(), v_3_a.layout());
+    }
+
+    {
+      dView3_2 dv_1(v_0.label(), v_0.layout());
+      dView3_2 dv_2(v_1.label(), v_1.layout());
+      dView3_2 dv_3(v_2.label(), v_2.layout());
+      dView3_2 dv_4(v_3.label(), v_3.layout());
+      dView3_2 dv_5(v_1_a.label(), v_1_a.layout());
+      dView3_2 dv_6(v_2_a.label(), v_2_a.layout());
+      dView3_2 dv_7(v_3_a.label(), v_3_a.layout());
+    }
+
+    {
+      dView3_3 dv_1(v_0.label(), v_0.layout());
+      dView3_3 dv_2(v_1.label(), v_1.layout());
+      dView3_3 dv_3(v_2.label(), v_2.layout());
+      dView3_3 dv_4(v_3.label(), v_3.layout());
+      dView3_3 dv_5(v_1_a.label(), v_1_a.layout());
+      dView3_3 dv_6(v_2_a.label(), v_2_a.layout());
+      dView3_3 dv_7(v_3_a.label(), v_3_a.layout());
+    }
+  }
+
   static void run_test() {
     // mfh 14 Feb 2014: This test doesn't actually create instances of
     // these types.  In order to avoid "unused type alias"
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_b.hpp b/packages/kokkos/core/unit_test/TestViewAPI_b.hpp
index ad9069e39..5cd9446a1 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_b.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_b.hpp
@@ -50,6 +50,8 @@ TEST(TEST_CATEGORY, view_api_b) {
   TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_a();
   TestViewAPI<double, TEST_EXECSPACE>::run_test_mirror();
   TestViewAPI<double, TEST_EXECSPACE>::run_test_scalar();
+  TestViewAPI<double, TEST_EXECSPACE>::run_test_contruction_from_layout();
+  TestViewAPI<double, TEST_EXECSPACE>::run_test_contruction_from_layout_2();
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
index d1d38022a..2dfde0857 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
@@ -45,7 +45,6 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
@@ -269,6 +268,43 @@ TEST(TEST_CATEGORY, view_allocation_large_rank) {
       Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, v_single);
   ASSERT_EQ(result(0, 0, 0, 0, 0, 0, 0, 0), 42);
 }
+
+template <typename ExecSpace, typename ViewType>
+struct TestViewShmemSizeOnDevice {
+  using ViewTestType = Kokkos::View<size_t, ExecSpace>;
+
+  TestViewShmemSizeOnDevice(size_t d1_, size_t d2_, size_t d3_)
+      : d1(d1_), d2(d2_), d3(d3_), shmemSize("shmemSize") {}
+
+  KOKKOS_FUNCTION void operator()(const int&) const {
+    auto shmem  = ViewType::shmem_size(d1, d2, d3);
+    shmemSize() = shmem;
+  }
+
+  size_t d1, d2, d3;
+  ViewTestType shmemSize;
+};
+
+TEST(TEST_CATEGORY, view_shmem_size_on_device) {
+  using ExecSpace = typename TEST_EXECSPACE::execution_space;
+  using ViewType  = Kokkos::View<int64_t***, ExecSpace>;
+
+  constexpr size_t d1 = 5;
+  constexpr size_t d2 = 7;
+  constexpr size_t d3 = 11;
+
+  TestViewShmemSizeOnDevice<ExecSpace, ViewType> testShmemSize(d1, d2, d3);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1), testShmemSize);
+
+  auto size = ViewType::shmem_size(d1, d2, d3);
+
+  auto shmemSizeHost = Kokkos::create_mirror_view_and_copy(
+      Kokkos::HostSpace(), testShmemSize.shmemSize);
+
+  ASSERT_EQ(size, shmemSizeHost());
+}
+
 }  // namespace Test
 
 #include <TestViewIsAssignable.hpp>
diff --git a/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp
new file mode 100644
index 000000000..e1eb88dba
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp
@@ -0,0 +1,430 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0
+// Copyright (2020) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+#define LIVE(EXPR, ARGS, DYNRANK) EXPECT_NO_THROW(EXPR)
+#define DIE(EXPR, ARGS, DYNRANK)                                           \
+  ASSERT_DEATH(                                                            \
+      EXPR,                                                                \
+      "Constructor for Kokkos View 'v_" #ARGS                              \
+      "' has mismatched number of arguments. Number of arguments = " #ARGS \
+      " but dynamic rank = " #DYNRANK)
+
+#define PARAM_0
+#define PARAM_1 1
+#define PARAM_2 1, 1
+#define PARAM_3 1, 1, 1
+#define PARAM_4 1, 1, 1, 1
+#define PARAM_5 1, 1, 1, 1, 1
+#define PARAM_6 1, 1, 1, 1, 1, 1
+#define PARAM_7 1, 1, 1, 1, 1, 1, 1
+
+#define PARAM_0_RANK 0
+#define PARAM_1_RANK 1
+#define PARAM_2_RANK 2
+#define PARAM_3_RANK 3
+#define PARAM_4_RANK 4
+#define PARAM_5_RANK 5
+#define PARAM_6_RANK 6
+#define PARAM_7_RANK 7
+
+using DType = int;
+
+// Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until
+// Kokkos::abort() aborts properly on that backend
+// Skip test execution when KOKKOS_COMPILER_NVHPC until fixed in GTEST
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) || (KOKKOS_COMPILER_NVHPC)
+#else
+TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+
+  using DType_0 = DType;
+  using DType_1 = DType *;
+  using DType_2 = DType **;
+  using DType_3 = DType ***;
+  using DType_4 = DType ****;
+  using DType_5 = DType *****;
+  using DType_6 = DType ******;
+  using DType_7 = DType *******;
+  {
+    // test View parameters for View dim = 0, dynamic = 0
+    LIVE({ Kokkos::View<DType_0> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_0> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_0> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_0> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_0> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_0> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_0> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_0> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 1, dynamic = 1
+    DIE({ Kokkos::View<DType_1> v_0("v_0" PARAM_0); }, 0, 1);
+    LIVE({ Kokkos::View<DType_1> v_1("v_1", PARAM_1); }, 1, 1);
+    DIE({ Kokkos::View<DType_1> v_2("v_2", PARAM_2); }, 2, 1);
+    DIE({ Kokkos::View<DType_1> v_3("v_3", PARAM_3); }, 3, 1);
+    DIE({ Kokkos::View<DType_1> v_4("v_4", PARAM_4); }, 4, 1);
+    DIE({ Kokkos::View<DType_1> v_5("v_5", PARAM_5); }, 5, 1);
+    DIE({ Kokkos::View<DType_1> v_6("v_6", PARAM_6); }, 6, 1);
+    DIE({ Kokkos::View<DType_1> v_7("v_7", PARAM_7); }, 7, 1);
+  }
+
+  {
+    // test View parameters for View dim = 2, dynamic = 2
+    DIE({ Kokkos::View<DType_2> v_0("v_0" PARAM_0); }, 0, 2);
+    DIE({ Kokkos::View<DType_2> v_1("v_1", PARAM_1); }, 1, 2);
+    LIVE({ Kokkos::View<DType_2> v_2("v_2", PARAM_2); }, 2, 2);
+    DIE({ Kokkos::View<DType_2> v_3("v_3", PARAM_3); }, 3, 2);
+    DIE({ Kokkos::View<DType_2> v_4("v_4", PARAM_4); }, 4, 2);
+    DIE({ Kokkos::View<DType_2> v_5("v_5", PARAM_5); }, 5, 2);
+    DIE({ Kokkos::View<DType_2> v_6("v_6", PARAM_6); }, 6, 2);
+    DIE({ Kokkos::View<DType_2> v_7("v_7", PARAM_7); }, 7, 2);
+  }
+
+  {
+    // test View parameters for View dim = 3, dynamic = 3
+    DIE({ Kokkos::View<DType_3> v_0("v_0" PARAM_0); }, 0, 3);
+    DIE({ Kokkos::View<DType_3> v_1("v_1", PARAM_1); }, 1, 3);
+    DIE({ Kokkos::View<DType_3> v_2("v_2", PARAM_2); }, 2, 3);
+    LIVE({ Kokkos::View<DType_3> v_3("v_3", PARAM_3); }, 3, 3);
+    DIE({ Kokkos::View<DType_3> v_4("v_4", PARAM_4); }, 4, 3);
+    DIE({ Kokkos::View<DType_3> v_5("v_5", PARAM_5); }, 5, 3);
+    DIE({ Kokkos::View<DType_3> v_6("v_6", PARAM_6); }, 6, 3);
+    DIE({ Kokkos::View<DType_3> v_7("v_7", PARAM_7); }, 7, 3);
+  }
+
+  {
+    // test View parameters for View dim = 4, dynamic = 4
+    DIE({ Kokkos::View<DType_4> v_0("v_0" PARAM_0); }, 0, 4);
+    DIE({ Kokkos::View<DType_4> v_1("v_1", PARAM_1); }, 1, 4);
+    DIE({ Kokkos::View<DType_4> v_2("v_2", PARAM_2); }, 2, 4);
+    DIE({ Kokkos::View<DType_4> v_3("v_3", PARAM_3); }, 3, 4);
+    LIVE({ Kokkos::View<DType_4> v_4("v_4", PARAM_4); }, 4, 4);
+    DIE({ Kokkos::View<DType_4> v_5("v_5", PARAM_5); }, 5, 4);
+    DIE({ Kokkos::View<DType_4> v_6("v_6", PARAM_6); }, 6, 4);
+    DIE({ Kokkos::View<DType_4> v_7("v_7", PARAM_7); }, 7, 4);
+  }
+
+  {
+    // test View parameters for View dim = 5, dynamic = 5
+    DIE({ Kokkos::View<DType_5> v_0("v_0" PARAM_0); }, 0, 5);
+    DIE({ Kokkos::View<DType_5> v_1("v_1", PARAM_1); }, 1, 5);
+    DIE({ Kokkos::View<DType_5> v_2("v_2", PARAM_2); }, 2, 5);
+    DIE({ Kokkos::View<DType_5> v_3("v_3", PARAM_3); }, 3, 5);
+    DIE({ Kokkos::View<DType_5> v_4("v_4", PARAM_4); }, 4, 5);
+    LIVE({ Kokkos::View<DType_5> v_5("v_5", PARAM_5); }, 5, 5);
+    DIE({ Kokkos::View<DType_5> v_6("v_6", PARAM_6); }, 6, 5);
+    DIE({ Kokkos::View<DType_5> v_7("v_7", PARAM_7); }, 7, 5);
+  }
+
+  {
+    // test View parameters for View dim = 6, dynamic = 6
+    DIE({ Kokkos::View<DType_6> v_0("v_0" PARAM_0); }, 0, 6);
+    DIE({ Kokkos::View<DType_6> v_1("v_1", PARAM_1); }, 1, 6);
+    DIE({ Kokkos::View<DType_6> v_2("v_2", PARAM_2); }, 2, 6);
+    DIE({ Kokkos::View<DType_6> v_3("v_3", PARAM_3); }, 3, 6);
+    DIE({ Kokkos::View<DType_6> v_4("v_4", PARAM_4); }, 4, 6);
+    DIE({ Kokkos::View<DType_6> v_5("v_5", PARAM_5); }, 5, 6);
+    LIVE({ Kokkos::View<DType_6> v_6("v_6", PARAM_6); }, 6, 6);
+    DIE({ Kokkos::View<DType_6> v_7("v_7", PARAM_7); }, 7, 6);
+  }
+
+  {
+    // test View parameters for View dim = 7, dynamic = 7
+    DIE({ Kokkos::View<DType_7> v_0("v_0" PARAM_0); }, 0, 7);
+    DIE({ Kokkos::View<DType_7> v_1("v_1", PARAM_1); }, 1, 7);
+    DIE({ Kokkos::View<DType_7> v_2("v_2", PARAM_2); }, 2, 7);
+    DIE({ Kokkos::View<DType_7> v_3("v_3", PARAM_3); }, 3, 7);
+    DIE({ Kokkos::View<DType_7> v_4("v_4", PARAM_4); }, 4, 7);
+    DIE({ Kokkos::View<DType_7> v_5("v_5", PARAM_5); }, 5, 7);
+    DIE({ Kokkos::View<DType_7> v_6("v_6", PARAM_6); }, 6, 7);
+    LIVE({ Kokkos::View<DType_7> v_7("v_7", PARAM_7); }, 7, 7);
+  }
+}
+
+TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_stat) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+
+  using DType_0 = DType;
+  using DType_1 = DType[1];
+  using DType_2 = DType[1][1];
+  using DType_3 = DType[1][1][1];
+  using DType_4 = DType[1][1][1][1];
+  using DType_5 = DType[1][1][1][1][1];
+  using DType_6 = DType[1][1][1][1][1][1];
+  using DType_7 = DType[1][1][1][1][1][1][1];
+  {
+    // test View parameters for View dim = 0, dynamic = 0
+    LIVE({ Kokkos::View<DType_0> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_0> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_0> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_0> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_0> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_0> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_0> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_0> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 1, dynamic = 0
+    LIVE({ Kokkos::View<DType_1> v_0("v_0" PARAM_0); }, 0, 0);
+    LIVE({ Kokkos::View<DType_1> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_1> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_1> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_1> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_1> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_1> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_1> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 2, dynamic = 0
+    LIVE({ Kokkos::View<DType_2> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_2> v_1("v_1", PARAM_1); }, 1, 0);
+    LIVE({ Kokkos::View<DType_2> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_2> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_2> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_2> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_2> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_2> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 3, dynamic = 0
+    LIVE({ Kokkos::View<DType_3> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_3> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_3> v_2("v_2", PARAM_2); }, 2, 0);
+    LIVE({ Kokkos::View<DType_3> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_3> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_3> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_3> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_3> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 4, dynamic = 0
+    LIVE({ Kokkos::View<DType_4> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_4> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_4> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_4> v_3("v_3", PARAM_3); }, 3, 0);
+    LIVE({ Kokkos::View<DType_4> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_4> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_4> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_4> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 5, dynamic = 0
+    LIVE({ Kokkos::View<DType_5> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_5> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_5> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_5> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_5> v_4("v_4", PARAM_4); }, 4, 0);
+    LIVE({ Kokkos::View<DType_5> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_5> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_5> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 6, dynamic = 0
+    LIVE({ Kokkos::View<DType_6> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_6> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_6> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_6> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_6> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_6> v_5("v_5", PARAM_5); }, 5, 0);
+    LIVE({ Kokkos::View<DType_6> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_6> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 7, dynamic = 0
+    LIVE({ Kokkos::View<DType_7> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_7> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_7> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_7> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_7> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_7> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_7> v_6("v_6", PARAM_6); }, 6, 0);
+    LIVE({ Kokkos::View<DType_7> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+}
+
+TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_mix) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+
+  using DType_0 = DType;
+  using DType_1 = DType[1];
+  using DType_2 = DType * [1];
+  using DType_3 = DType * * [1];
+  using DType_4 = DType ** * [1];
+  using DType_5 = DType *** * [1];
+  using DType_6 = DType **** * [1];
+  using DType_7 = DType ***** * [1];
+  {
+    // test View parameters for View dim = 0, dynamic = 0
+    LIVE({ Kokkos::View<DType_0> v_0("v_0" PARAM_0); }, 0, 0);
+    DIE({ Kokkos::View<DType_0> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_0> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_0> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_0> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_0> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_0> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_0> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 1, dynamic = 0
+    LIVE({ Kokkos::View<DType_1> v_0("v_0" PARAM_0); }, 0, 0);
+    LIVE({ Kokkos::View<DType_1> v_1("v_1", PARAM_1); }, 1, 0);
+    DIE({ Kokkos::View<DType_1> v_2("v_2", PARAM_2); }, 2, 0);
+    DIE({ Kokkos::View<DType_1> v_3("v_3", PARAM_3); }, 3, 0);
+    DIE({ Kokkos::View<DType_1> v_4("v_4", PARAM_4); }, 4, 0);
+    DIE({ Kokkos::View<DType_1> v_5("v_5", PARAM_5); }, 5, 0);
+    DIE({ Kokkos::View<DType_1> v_6("v_6", PARAM_6); }, 6, 0);
+    DIE({ Kokkos::View<DType_1> v_7("v_7", PARAM_7); }, 7, 0);
+  }
+
+  {
+    // test View parameters for View dim = 2, dynamic = 1
+    DIE({ Kokkos::View<DType_2> v_0("v_0" PARAM_0); }, 0, 1);
+    LIVE({ Kokkos::View<DType_2> v_1("v_1", PARAM_1); }, 1, 1);
+    LIVE({ Kokkos::View<DType_2> v_2("v_2", PARAM_2); }, 2, 1);
+    DIE({ Kokkos::View<DType_2> v_3("v_3", PARAM_3); }, 3, 1);
+    DIE({ Kokkos::View<DType_2> v_4("v_4", PARAM_4); }, 4, 1);
+    DIE({ Kokkos::View<DType_2> v_5("v_5", PARAM_5); }, 5, 1);
+    DIE({ Kokkos::View<DType_2> v_6("v_6", PARAM_6); }, 6, 1);
+    DIE({ Kokkos::View<DType_2> v_7("v_7", PARAM_7); }, 7, 1);
+  }
+
+  {
+    // test View parameters for View dim = 3, dynamic = 2
+    DIE({ Kokkos::View<DType_3> v_0("v_0" PARAM_0); }, 0, 2);
+    DIE({ Kokkos::View<DType_3> v_1("v_1", PARAM_1); }, 1, 2);
+    LIVE({ Kokkos::View<DType_3> v_2("v_2", PARAM_2); }, 2, 2);
+    LIVE({ Kokkos::View<DType_3> v_3("v_3", PARAM_3); }, 3, 2);
+    DIE({ Kokkos::View<DType_3> v_4("v_4", PARAM_4); }, 4, 2);
+    DIE({ Kokkos::View<DType_3> v_5("v_5", PARAM_5); }, 5, 2);
+    DIE({ Kokkos::View<DType_3> v_6("v_6", PARAM_6); }, 6, 2);
+    DIE({ Kokkos::View<DType_3> v_7("v_7", PARAM_7); }, 7, 2);
+  }
+
+  {
+    // test View parameters for View dim = 4, dynamic = 3
+    DIE({ Kokkos::View<DType_4> v_0("v_0" PARAM_0); }, 0, 3);
+    DIE({ Kokkos::View<DType_4> v_1("v_1", PARAM_1); }, 1, 3);
+    DIE({ Kokkos::View<DType_4> v_2("v_2", PARAM_2); }, 2, 3);
+    LIVE({ Kokkos::View<DType_4> v_3("v_3", PARAM_3); }, 3, 3);
+    LIVE({ Kokkos::View<DType_4> v_4("v_4", PARAM_4); }, 4, 3);
+    DIE({ Kokkos::View<DType_4> v_5("v_5", PARAM_5); }, 5, 3);
+    DIE({ Kokkos::View<DType_4> v_6("v_6", PARAM_6); }, 6, 3);
+    DIE({ Kokkos::View<DType_4> v_7("v_7", PARAM_7); }, 7, 3);
+  }
+
+  {
+    // test View parameters for View dim = 5, dynamic = 4
+    DIE({ Kokkos::View<DType_5> v_0("v_0" PARAM_0); }, 0, 4);
+    DIE({ Kokkos::View<DType_5> v_1("v_1", PARAM_1); }, 1, 4);
+    DIE({ Kokkos::View<DType_5> v_2("v_2", PARAM_2); }, 2, 4);
+    DIE({ Kokkos::View<DType_5> v_3("v_3", PARAM_3); }, 3, 4);
+    LIVE({ Kokkos::View<DType_5> v_4("v_4", PARAM_4); }, 4, 4);
+    LIVE({ Kokkos::View<DType_5> v_5("v_5", PARAM_5); }, 5, 4);
+    DIE({ Kokkos::View<DType_5> v_6("v_6", PARAM_6); }, 6, 4);
+    DIE({ Kokkos::View<DType_5> v_7("v_7", PARAM_7); }, 7, 4);
+  }
+
+  {
+    // test View parameters for View dim = 6, dynamic = 5
+    DIE({ Kokkos::View<DType_6> v_0("v_0" PARAM_0); }, 0, 5);
+    DIE({ Kokkos::View<DType_6> v_1("v_1", PARAM_1); }, 1, 5);
+    DIE({ Kokkos::View<DType_6> v_2("v_2", PARAM_2); }, 2, 5);
+    DIE({ Kokkos::View<DType_6> v_3("v_3", PARAM_3); }, 3, 5);
+    DIE({ Kokkos::View<DType_6> v_4("v_4", PARAM_4); }, 4, 5);
+    LIVE({ Kokkos::View<DType_6> v_5("v_5", PARAM_5); }, 5, 5);
+    LIVE({ Kokkos::View<DType_6> v_6("v_6", PARAM_6); }, 6, 5);
+    DIE({ Kokkos::View<DType_6> v_7("v_7", PARAM_7); }, 7, 5);
+  }
+
+  {
+    // test View parameters for View dim = 7, dynamic = 6
+    DIE({ Kokkos::View<DType_7> v_0("v_0" PARAM_0); }, 0, 6);
+    DIE({ Kokkos::View<DType_7> v_1("v_1", PARAM_1); }, 1, 6);
+    DIE({ Kokkos::View<DType_7> v_2("v_2", PARAM_2); }, 2, 6);
+    DIE({ Kokkos::View<DType_7> v_3("v_3", PARAM_3); }, 3, 6);
+    DIE({ Kokkos::View<DType_7> v_4("v_4", PARAM_4); }, 4, 6);
+    DIE({ Kokkos::View<DType_7> v_5("v_5", PARAM_5); }, 5, 6);
+    LIVE({ Kokkos::View<DType_7> v_6("v_6", PARAM_6); }, 6, 6);
+    LIVE({ Kokkos::View<DType_7> v_7("v_7", PARAM_7); }, 7, 6);
+  }
+}
+#endif  // KOKKOS_ENABLE_OPENMPTARGET
+
+#undef PARAM_0
+#undef PARAM_1
+#undef PARAM_2
+#undef PARAM_3
+#undef PARAM_4
+#undef PARAM_5
+#undef PARAM_6
+#undef PARAM_7
+
+#undef PARAM_0_RANK
+#undef PARAM_1_RANK
+#undef PARAM_2_RANK
+#undef PARAM_3_RANK
+#undef PARAM_4_RANK
+#undef PARAM_5_RANK
+#undef PARAM_6_RANK
+#undef PARAM_7_RANK
+
+#undef DType
+
+#undef LIVE
+#undef DIE
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestViewHooks.hpp b/packages/kokkos/core/unit_test/TestViewHooks.hpp
new file mode 100644
index 000000000..ac679bedb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewHooks.hpp
@@ -0,0 +1,159 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef TESTVIEWHOOKS_HPP_
+#define TESTVIEWHOOKS_HPP_
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+template <class DeviceType>
+struct TestViewHooks {
+  struct TestSubscriber;
+
+  static_assert(
+      Kokkos::Experimental::is_hooks_policy<
+          Kokkos::Experimental::SubscribableViewHooks<TestSubscriber> >::value,
+      "Must be a hooks policy");
+
+  using test_view_type =
+      Kokkos::View<double **,
+                   Kokkos::Experimental::SubscribableViewHooks<TestSubscriber>,
+                   DeviceType>;
+
+  struct TestSubscriber {
+    static test_view_type *self_ptr;
+    static const test_view_type *other_ptr;
+
+    template <typename View>
+    static void copy_constructed(View &self, const View &other) {
+      self_ptr  = &self;
+      other_ptr = &other;
+    }
+
+    template <typename View>
+    static void move_constructed(View &self, const View &other) {
+      self_ptr  = &self;
+      other_ptr = &other;
+    }
+
+    template <typename View>
+    static void copy_assigned(View &self, const View &other) {
+      self_ptr  = &self;
+      other_ptr = &other;
+    }
+
+    template <typename View>
+    static void move_assigned(View &self, const View &other) {
+      self_ptr  = &self;
+      other_ptr = &other;
+    }
+
+    static void reset() {
+      self_ptr  = nullptr;
+      other_ptr = nullptr;
+    }
+  };
+
+  static void testViewHooksCopyConstruct() {
+    TestSubscriber::reset();
+    test_view_type testa;
+
+    test_view_type testb(testa);
+    EXPECT_EQ(TestSubscriber::self_ptr, &testb);
+    EXPECT_EQ(TestSubscriber::other_ptr, &testa);
+  }
+
+  static void testViewHooksMoveConstruct() {
+    TestSubscriber::reset();
+    test_view_type testa;
+
+    test_view_type testb(std::move(testa));
+    EXPECT_EQ(TestSubscriber::self_ptr, &testb);
+
+    // This is valid, even if the view is moved-from
+    EXPECT_EQ(TestSubscriber::other_ptr, &testa);
+  }
+
+  static void testViewHooksCopyAssign() {
+    TestSubscriber::reset();
+    test_view_type testa;
+
+    test_view_type testb;
+    testb = testa;
+    EXPECT_EQ(TestSubscriber::self_ptr, &testb);
+    EXPECT_EQ(TestSubscriber::other_ptr, &testa);
+  }
+
+  static void testViewHooksMoveAssign() {
+    TestSubscriber::reset();
+    test_view_type testa;
+
+    test_view_type testb;
+    testb = std::move(testa);
+    EXPECT_EQ(TestSubscriber::self_ptr, &testb);
+
+    // This is valid, even if the view is moved-from
+    EXPECT_EQ(TestSubscriber::other_ptr, &testa);
+  }
+};
+
+template <class DeviceType>
+typename TestViewHooks<DeviceType>::test_view_type
+    *TestViewHooks<DeviceType>::TestSubscriber::self_ptr = nullptr;
+
+template <class DeviceType>
+const typename TestViewHooks<DeviceType>::test_view_type
+    *TestViewHooks<DeviceType>::TestSubscriber::other_ptr = nullptr;
+
+TEST(TEST_CATEGORY, view_hooks) {
+  using ExecSpace = TEST_EXECSPACE;
+  TestViewHooks<ExecSpace>::testViewHooksCopyConstruct();
+  TestViewHooks<ExecSpace>::testViewHooksMoveConstruct();
+  TestViewHooks<ExecSpace>::testViewHooksCopyAssign();
+  TestViewHooks<ExecSpace>::testViewHooksMoveAssign();
+}
+
+}  // namespace Test
+#endif  // TESTVIEWHOOKS_HPP_
diff --git a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp
index fcf9f75f3..03c3b977e 100644
--- a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp
+++ b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp
@@ -12,14 +12,14 @@ struct TestAssignability {
   template <class MappingType>
   static void try_assign(
       ViewTypeDst& dst, ViewTypeSrc& src,
-      typename std::enable_if<MappingType::is_assignable>::type* = nullptr) {
+      std::enable_if_t<MappingType::is_assignable>* = nullptr) {
     dst = src;
   }
 
   template <class MappingType>
   static void try_assign(
       ViewTypeDst&, ViewTypeSrc&,
-      typename std::enable_if<!MappingType::is_assignable>::type* = nullptr) {
+      std::enable_if_t<!MappingType::is_assignable>* = nullptr) {
     Kokkos::Impl::throw_runtime_exception(
         "TestAssignability::try_assign: Unexpected call path");
   }
diff --git a/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp b/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
index d592fcaf2..a8d28933a 100644
--- a/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
+++ b/packages/kokkos/core/unit_test/TestViewLayoutStrideAssignment.hpp
@@ -44,7 +44,6 @@
 
 #include <gtest/gtest.h>
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <time.h>
@@ -56,10 +55,7 @@ namespace Test {
 TEST(TEST_CATEGORY, view_layoutstride_left_to_layoutleft_assignment) {
   using exec_space = TEST_EXECSPACE;
 
-  auto t = time(nullptr);
-  srand(t);  // Use current time as seed for random generator
-  printf("view_layoutstride_left_to_layoutleft_assignment: srand(%lu)\n",
-         static_cast<unsigned long>(t));
+  srand(123456);  // arbitrary seed for random generator
 
   {  // Assignment of rank-1 LayoutLeft = LayoutStride
     int ndims   = 1;
@@ -338,10 +334,7 @@ TEST(TEST_CATEGORY, view_layoutstride_left_to_layoutleft_assignment) {
 TEST(TEST_CATEGORY, view_layoutstride_right_to_layoutright_assignment) {
   using exec_space = TEST_EXECSPACE;
 
-  auto t = time(nullptr);
-  srand(t);  // Use current time as seed for random generator
-  printf("view_layoutstride_right_to_layoutright_assignment: srand(%lu)\n",
-         static_cast<unsigned long>(t));
+  srand(123456);  // arbitrary seed for random generator
 
   {  // Assignment of rank-1 LayoutRight = LayoutStride
     int ndims   = 1;
@@ -621,10 +614,7 @@ TEST(TEST_CATEGORY, view_layoutstride_right_to_layoutright_assignment) {
 TEST(TEST_CATEGORY_DEATH, view_layoutstride_right_to_layoutleft_assignment) {
   using exec_space = TEST_EXECSPACE;
 
-  auto t = time(nullptr);
-  srand(t);  // Use current time as seed for random generator
-  printf("view_layoutstride_right_to_layoutleft_assignment: srand(%lu)\n",
-         static_cast<unsigned long>(t));
+  srand(123456);  // arbitrary seed for random generator
 
   {  // Assignment of rank-1 LayoutLeft = LayoutStride (LayoutRight compatible)
     int ndims   = 1;
@@ -776,10 +766,7 @@ TEST(TEST_CATEGORY_DEATH, view_layoutstride_right_to_layoutleft_assignment) {
 TEST(TEST_CATEGORY_DEATH, view_layoutstride_left_to_layoutright_assignment) {
   using exec_space = TEST_EXECSPACE;
 
-  auto t = time(nullptr);
-  srand(t);  // Use current time as seed for random generator
-  printf("view_layoutstride_left_to_layoutright_assignment: srand(%lu)\n",
-         static_cast<unsigned long>(t));
+  srand(123456);  // arbitrary seed for random generator
 
   {  // Assignment of rank-1 LayoutRight = LayoutStride (LayoutLeft compatible)
     int ndims   = 1;
diff --git a/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp b/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp
index 2510a1244..b0fabddbf 100644
--- a/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp
+++ b/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp
@@ -42,6 +42,10 @@
 //@HEADER
 */
 
+#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+#endif
+
 #include <cstdio>
 
 #include <gtest/gtest.h>
@@ -1776,3 +1780,5 @@ TEST(TEST_CATEGORY, view_layouttiled_subtile) {
       4, 12, 16, 12);
 }
 }  // namespace Test
+
+#undef KOKKOS_IMPL_PUBLIC_INCLUDE
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
index edeb1abda..5a54a9361 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
@@ -45,7 +45,6 @@
 #include <gtest/gtest.h>
 
 #include <cstddef>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
@@ -249,11 +248,11 @@ void test_view_mapping() {
     ASSERT_EQ(layout.dimension[0], 2u);
     ASSERT_EQ(layout.dimension[1], 3u);
     ASSERT_EQ(layout.dimension[2], 4u);
-    ASSERT_EQ(layout.dimension[3], 1u);
-    ASSERT_EQ(layout.dimension[4], 1u);
-    ASSERT_EQ(layout.dimension[5], 1u);
-    ASSERT_EQ(layout.dimension[6], 1u);
-    ASSERT_EQ(layout.dimension[7], 1u);
+    ASSERT_EQ(layout.dimension[3], KOKKOS_INVALID_INDEX);
+    ASSERT_EQ(layout.dimension[4], KOKKOS_INVALID_INDEX);
+    ASSERT_EQ(layout.dimension[5], KOKKOS_INVALID_INDEX);
+    ASSERT_EQ(layout.dimension[6], KOKKOS_INVALID_INDEX);
+    ASSERT_EQ(layout.dimension[7], KOKKOS_INVALID_INDEX);
 
     ASSERT_EQ(stride3.m_dim.rank, 3u);
     ASSERT_EQ(stride3.m_dim.N0, 2u);
@@ -447,8 +446,8 @@ void test_view_mapping() {
     Kokkos::Impl::ViewDimension<N0, N1, N2, N3> dim;
 
     SubviewExtents tmp(dim, N0 / 2, Kokkos::ALL,
-                       std::pair<int, int>(N2 / 4, 10 + N2 / 4),
-                       Kokkos::pair<int, int>(N3 / 4, 20 + N3 / 4));
+                       std::pair<size_t, size_t>(N2 / 4, 10 + N2 / 4),
+                       Kokkos::pair<size_t, size_t>(N3 / 4, 20 + N3 / 4));
 
     ASSERT_EQ(tmp.domain_offset(0), N0 / 2);
     ASSERT_EQ(tmp.domain_offset(1), 0u);
@@ -632,8 +631,7 @@ void test_view_mapping() {
 
     using a_const_int_r1 = ViewDataAnalysis<const int[], void>;
 
-    static_assert(
-        std::is_same<typename a_const_int_r1::specialize, void>::value, "");
+    static_assert(std::is_void<typename a_const_int_r1::specialize>::value, "");
     static_assert(std::is_same<typename a_const_int_r1::dimension,
                                Kokkos::Impl::ViewDimension<0> >::value,
                   "");
@@ -664,8 +662,7 @@ void test_view_mapping() {
 
     using a_const_int_r3 = ViewDataAnalysis<const int* * [4], void>;
 
-    static_assert(
-        std::is_same<typename a_const_int_r3::specialize, void>::value, "");
+    static_assert(std::is_void<typename a_const_int_r3::specialize>::value, "");
 
     static_assert(std::is_same<typename a_const_int_r3::dimension,
                                Kokkos::Impl::ViewDimension<0, 0, 4> >::value,
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp
index 3e6d91c0b..b6f83e2f2 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp
@@ -44,7 +44,6 @@
 
 #include <gtest/gtest.h>
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
index e52362a05..0dd6a8d52 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
@@ -44,7 +44,6 @@
 
 #include <gtest/gtest.h>
 
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp b/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp
new file mode 100644
index 000000000..aeab9dbcb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp
@@ -0,0 +1,221 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <gtest/gtest.h>
+
+#ifndef KOKKOS_COMPILER_NVHPC  // FIXME_NVHPC
+template <class View, class ExecutionSpace>
+struct TestViewMemoryAccessViolation {
+  View v;
+  static constexpr auto rank = View::rank;
+
+  template <std::size_t... Is>
+  KOKKOS_FUNCTION decltype(auto) bad_access(std::index_sequence<Is...>) const {
+    return v((Is * 0)...);
+  }
+
+  KOKKOS_FUNCTION void operator()(int) const {
+    ++bad_access(std::make_index_sequence<rank>{});
+  }
+
+  TestViewMemoryAccessViolation(View w, ExecutionSpace const& s,
+                                std::string const& matcher)
+      : v(std::move(w)) {
+    constexpr bool view_accessible_from_execution_space =
+        Kokkos::SpaceAccessibility<
+            /*AccessSpace=*/ExecutionSpace,
+            /*MemorySpace=*/typename View::memory_space>::accessible;
+    EXPECT_FALSE(view_accessible_from_execution_space);
+    EXPECT_DEATH(
+        {
+          Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(s, 0, 1),
+                               *this);
+          Kokkos::fence();
+        },
+        matcher);
+  }
+};
+
+template <class View, class ExecutionSpace>
+void test_view_memory_access_violation(View v, ExecutionSpace const& s,
+                                       std::string const& m) {
+  TestViewMemoryAccessViolation<View, ExecutionSpace>(std::move(v), s, m);
+}
+
+template <class View, class LblOrPtr, std::size_t... Is>
+auto make_view_impl(LblOrPtr x, std::index_sequence<Is...>) {
+  return View(x, (Is + 1)...);
+}
+
+template <class View, class LblOrPtr>
+auto make_view(LblOrPtr x) {
+  return make_view_impl<View>(std::move(x),
+                              std::make_index_sequence<View::rank>());
+}
+
+template <class ExecutionSpace>
+void test_view_memory_access_violations_from_host() {
+  Kokkos::DefaultHostExecutionSpace const host_exec_space{};
+  // clang-format off
+  using V0 = Kokkos::View<int,         ExecutionSpace>;
+  using V1 = Kokkos::View<int*,        ExecutionSpace>;
+  using V2 = Kokkos::View<int**,       ExecutionSpace>;
+  using V3 = Kokkos::View<int***,      ExecutionSpace>;
+  using V4 = Kokkos::View<int****,     ExecutionSpace>;
+  using V5 = Kokkos::View<int*****,    ExecutionSpace>;
+  using V6 = Kokkos::View<int******,   ExecutionSpace>;
+  using V7 = Kokkos::View<int*******,  ExecutionSpace>;
+  using V8 = Kokkos::View<int********, ExecutionSpace>;
+  std::string const prefix = "Kokkos::View ERROR: attempt to access inaccessible memory space";
+  std::string const lbl = "my_label";
+  test_view_memory_access_violation(make_view<V0>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V1>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V2>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V3>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V4>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V5>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V6>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V7>(lbl), host_exec_space, prefix + ".*" + lbl);
+  test_view_memory_access_violation(make_view<V8>(lbl), host_exec_space, prefix + ".*" + lbl);
+  int* const ptr = nullptr;
+  test_view_memory_access_violation(make_view<V0>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V1>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V2>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V3>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V4>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V5>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V6>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V7>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  test_view_memory_access_violation(make_view<V8>(ptr), host_exec_space, prefix + ".*UNMANAGED");
+  // clang-format on
+}
+
+template <class ExecutionSpace>
+void test_view_memory_access_violations_from_device() {
+  ExecutionSpace const exec_space{};
+  // clang-format off
+  using V0 = Kokkos::View<int,         Kokkos::HostSpace>;
+  using V1 = Kokkos::View<int*,        Kokkos::HostSpace>;
+  using V2 = Kokkos::View<int**,       Kokkos::HostSpace>;
+  using V3 = Kokkos::View<int***,      Kokkos::HostSpace>;
+  using V4 = Kokkos::View<int****,     Kokkos::HostSpace>;
+  using V5 = Kokkos::View<int*****,    Kokkos::HostSpace>;
+  using V6 = Kokkos::View<int******,   Kokkos::HostSpace>;
+  using V7 = Kokkos::View<int*******,  Kokkos::HostSpace>;
+  using V8 = Kokkos::View<int********, Kokkos::HostSpace>;
+  std::string const prefix = "Kokkos::View ERROR: attempt to access inaccessible memory space";
+  std::string const lbl = "my_label";
+  test_view_memory_access_violation(make_view<V0>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V1>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V2>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V3>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V4>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V5>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V6>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V7>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V8>(lbl), exec_space, prefix + ".*UNAVAILABLE");
+  int* const ptr = nullptr;
+  test_view_memory_access_violation(make_view<V0>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V1>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V2>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V3>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V4>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V5>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V6>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V7>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  test_view_memory_access_violation(make_view<V8>(ptr), exec_space, prefix + ".*UNAVAILABLE");
+  // clang-format on
+}
+
+// FIXME_SYCL
+#if !(defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_SYCL))
+TEST(TEST_CATEGORY_DEATH, view_memory_access_violations_from_host) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  if (Kokkos::SpaceAccessibility<
+          /*AccessSpace=*/Kokkos::HostSpace,
+          /*MemorySpace=*/typename ExecutionSpace::memory_space>::accessible) {
+    GTEST_SKIP() << "skipping since no memory access violation would occur";
+  }
+
+  test_view_memory_access_violations_from_host<ExecutionSpace>();
+}
+#endif
+
+TEST(TEST_CATEGORY_DEATH, view_memory_access_violations_from_device) {
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+
+  using ExecutionSpace = TEST_EXECSPACE;
+
+  if (Kokkos::SpaceAccessibility<
+          /*AccessSpace=*/ExecutionSpace,
+          /*MemorySpace=*/Kokkos::HostSpace>::accessible) {
+    GTEST_SKIP() << "skipping since no memory access violation would occur";
+  }
+
+#if defined(KOKKOS_IMPL_HIP_ABORT_DOES_NOT_PRINT_MESSAGE)
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value) {
+    GTEST_SKIP() << "skipping because not yet supported with HIP toolchain";
+  }
+#endif
+#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG)  // FIXME_SYCL
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value) {
+    GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG "
+                    "is defined";
+  }
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)  // FIXME_OPENMPTARGET
+  if (std::is_same<ExecutionSpace, Kokkos::Experimental::OpenMPTarget>::value) {
+    GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not "
+                    "able to abort from the device";
+  }
+#endif
+
+  test_view_memory_access_violations_from_device<ExecutionSpace>();
+}
+#endif
diff --git a/packages/kokkos/core/unit_test/TestViewOfClass.hpp b/packages/kokkos/core/unit_test/TestViewOfClass.hpp
index 634f1da73..e9128debf 100644
--- a/packages/kokkos/core/unit_test/TestViewOfClass.hpp
+++ b/packages/kokkos/core/unit_test/TestViewOfClass.hpp
@@ -45,7 +45,6 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp b/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp
index d1dfb7c51..b7c50d636 100644
--- a/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp
+++ b/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp
@@ -45,7 +45,6 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 
diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp
index 43bbb4320..3114d41be 100644
--- a/packages/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp
@@ -46,7 +46,6 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
-#include <stdexcept>
 #include <sstream>
 #include <iostream>
 #include <type_traits>
@@ -68,14 +67,14 @@ struct static_assert_predicate_true_impl;
 
 template <template <class...> class predicate, class... message, class... args>
 struct static_assert_predicate_true_impl<
-    typename std::enable_if<predicate<args...>::type::value>::type, predicate,
+    std::enable_if_t<predicate<args...>::type::value>, predicate,
     static_predicate_message<message...>, args...> {
   using type = int;
 };
 
 template <template <class...> class predicate, class... message, class... args>
 struct static_assert_predicate_true_impl<
-    typename std::enable_if<!predicate<args...>::type::value>::type, predicate,
+    std::enable_if_t<!predicate<args...>::type::value>, predicate,
     static_predicate_message<message...>, args...> {
   using type = typename _kokkos____________________static_test_failure_____<
       message...>::type;
diff --git a/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp b/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp
index 8a58888c7..2bf93ab3f 100644
--- a/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp
+++ b/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp
@@ -55,6 +55,8 @@ TEST(TEST_CATEGORY, resize_realloc_no_init) {
       [&]() {
         Kokkos::resize(Kokkos::WithoutInitializing, bla, 5, 6, 7, 9);
         Kokkos::realloc(Kokkos::WithoutInitializing, bla, 8, 8, 8, 8);
+        Kokkos::realloc(Kokkos::view_alloc(Kokkos::WithoutInitializing), bla, 5,
+                        6, 7, 8);
       },
       [&](BeginParallelForEvent event) {
         if (event.descriptor().find("initialization") != std::string::npos)
@@ -96,3 +98,266 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc) {
   ASSERT_TRUE(success);
   listen_tool_events(Config::DisableAll());
 }
+
+TEST(TEST_CATEGORY, realloc_exec_space) {
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<typename TEST_EXECSPACE::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences";
+#endif
+// FIXME_OPENMPTARGET The OpenMPTarget backend doesn't implement allocate taking
+// an execution space instance properly so it needs another fence
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
+    GTEST_SKIP() << "skipping since the OpenMPTarget backend doesn't implement "
+                    "allocate taking an execution space instance properly";
+#endif
+
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::View<int*, TEST_EXECSPACE>;
+  view_type outer_view, outer_view2;
+
+  auto success = validate_absence(
+      [&]() {
+        view_type inner_view(Kokkos::view_alloc(TEST_EXECSPACE{}, "bla"), 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+        Kokkos::realloc(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, TEST_EXECSPACE{}),
+            inner_view, 10);
+        outer_view2 = inner_view;
+        Kokkos::realloc(Kokkos::view_alloc(TEST_EXECSPACE{}), inner_view, 10);
+      },
+      [&](BeginFenceEvent event) {
+        if ((event.descriptor().find("Debug Only Check for Execution Error") !=
+             std::string::npos) ||
+            (event.descriptor().find("HostSpace fence") != std::string::npos))
+          return MatchDiagnostic{false};
+        return MatchDiagnostic{true, {"Found fence event!"}};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+namespace {
+struct NonTriviallyCopyable {
+  KOKKOS_FUNCTION NonTriviallyCopyable() {}
+  KOKKOS_FUNCTION NonTriviallyCopyable(const NonTriviallyCopyable&) {}
+};
+}  // namespace
+
+TEST(TEST_CATEGORY, view_alloc) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::View<NonTriviallyCopyable*, TEST_EXECSPACE>;
+  view_type outer_view;
+
+  auto success = validate_existence(
+      [&]() {
+        view_type inner_view(Kokkos::view_alloc("bla"), 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+      },
+      [&](BeginFenceEvent event) {
+        return MatchDiagnostic{
+            event.descriptor().find(
+                "Kokkos::Impl::ViewValueFunctor: View init/destroy fence") !=
+            std::string::npos};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, view_alloc_exec_space) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::View<NonTriviallyCopyable*, TEST_EXECSPACE>;
+  view_type outer_view;
+
+  auto success = validate_absence(
+      [&]() {
+        view_type inner_view(Kokkos::view_alloc(TEST_EXECSPACE{}, "bla"), 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+      },
+      [&](BeginFenceEvent event) {
+        return MatchDiagnostic{
+            event.descriptor().find(
+                "Kokkos::Impl::ViewValueFunctor: View init/destroy fence") !=
+            std::string::npos};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, view_alloc_int) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::View<int*, TEST_EXECSPACE>;
+  view_type outer_view;
+
+  auto success = validate_existence(
+      [&]() {
+        view_type inner_view("bla", 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+      },
+      [&](BeginFenceEvent event) {
+        return MatchDiagnostic{
+            event.descriptor().find(
+                "Kokkos::Impl::ViewValueFunctor: View init/destroy fence") !=
+            std::string::npos};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, view_alloc_exec_space_int) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences());
+  using view_type = Kokkos::View<int*, TEST_EXECSPACE>;
+  view_type outer_view;
+
+  auto success = validate_absence(
+      [&]() {
+        view_type inner_view(Kokkos::view_alloc(TEST_EXECSPACE{}, "bla"), 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+      },
+      [&](BeginFenceEvent event) {
+        return MatchDiagnostic{
+            event.descriptor().find(
+                "Kokkos::Impl::ViewValueFunctor: View init/destroy fence") !=
+            std::string::npos};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, deep_copy_zero_memset) {
+// FIXME_OPENMPTARGET The OpenMPTarget backend doesn't implement ZeroMemset
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
+    GTEST_SKIP() << "skipping since the OpenMPTarget backend doesn't implement "
+                    "ZeroMemset";
+#endif
+
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::View<int*, TEST_EXECSPACE> bla("bla", 8);
+
+  auto success =
+      validate_absence([&]() { Kokkos::deep_copy(bla, 0); },
+                       [&](BeginParallelForEvent) {
+                         return MatchDiagnostic{true, {"Found begin event"}};
+                       },
+                       [&](EndParallelForEvent) {
+                         return MatchDiagnostic{true, {"Found end event"}};
+                       });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, resize_exec_space) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableFences(),
+                     Config::EnableKernels());
+  Kokkos::View<int*** * [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, 5);
+
+  auto success = validate_absence(
+      [&]() {
+        Kokkos::resize(
+            Kokkos::view_alloc(TEST_EXECSPACE{}, Kokkos::WithoutInitializing),
+            bla, 5, 6, 7, 8);
+      },
+      [&](BeginFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndFenceEvent event) {
+        if (event.descriptor().find("Kokkos::resize(View)") !=
+            std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](BeginParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found begin event"}};
+        return MatchDiagnostic{false};
+      },
+      [&](EndParallelForEvent event) {
+        if (event.descriptor().find("initialization") != std::string::npos)
+          return MatchDiagnostic{true, {"Found end event"}};
+        return MatchDiagnostic{false};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, view_allocation_int) {
+// FIXME_OPENMPTARGET
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
+    GTEST_SKIP() << "skipping since the OpenMPTarget has unexpected fences";
+#endif
+
+  using ExecutionSpace = TEST_EXECSPACE;
+  if (Kokkos::SpaceAccessibility<
+          /*AccessSpace=*/Kokkos::HostSpace,
+          /*MemorySpace=*/ExecutionSpace::memory_space>::accessible) {
+    GTEST_SKIP() << "skipping since the fence checked for isn't necessary";
+  }
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::EnableAll());
+  using view_type = Kokkos::View<int*, TEST_EXECSPACE>;
+  view_type outer_view;
+
+  auto success = validate_existence(
+      [&]() {
+        view_type inner_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing, "bla"), 8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+      },
+      [&](BeginFenceEvent event) {
+        return MatchDiagnostic{
+            event.descriptor().find(
+                "fence after copying header from HostSpace") !=
+            std::string::npos};
+      });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
+
+TEST(TEST_CATEGORY, view_allocation_exec_space_int) {
+#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
+    GTEST_SKIP() << "skipping since the OpenMPTarget has unexpected fences";
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<TEST_EXECSPACE::memory_space, Kokkos::CudaUVMSpace>::value)
+    GTEST_SKIP()
+        << "skipping since the CudaUVMSpace requires additiional fences";
+#endif
+
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::EnableAll());
+  using view_type = Kokkos::View<int*, TEST_EXECSPACE>;
+  view_type outer_view;
+
+  auto success = validate_absence(
+      [&]() {
+        view_type inner_view(Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                                                TEST_EXECSPACE{}, "bla"),
+                             8);
+        // Avoid testing the destructor
+        outer_view = inner_view;
+      },
+      [&](BeginFenceEvent) { return MatchDiagnostic{true}; });
+  ASSERT_TRUE(success);
+  listen_tool_events(Config::DisableAll());
+}
diff --git a/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp
index 12c69926c..dace3924a 100644
--- a/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp
@@ -48,6 +48,7 @@
 #include <gtest/gtest.h>
 
 #define TEST_CATEGORY hip_hostpinned
+#define TEST_CATEGORY_DEATH hip_hostpinned_DeathTest
 #define TEST_EXECSPACE Kokkos::Experimental::HIPHostPinnedSpace
 
 #endif
diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Numeric.hpp b/packages/kokkos/core/unit_test/category_files/TestHIPManaged_Category.hpp
similarity index 82%
rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_Numeric.hpp
rename to packages/kokkos/core/unit_test/category_files/TestHIPManaged_Category.hpp
index 793927e99..1ff88a484 100644
--- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Numeric.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestHIPManaged_Category.hpp
@@ -42,18 +42,13 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_STD_NUMERIC_ALL_HPP
-#define KOKKOS_STD_NUMERIC_ALL_HPP
+#ifndef KOKKOS_TEST_HIPUNIFIED_HPP
+#define KOKKOS_TEST_HIPUNIFIED_HPP
 
-#include "./numeric/Kokkos_AdjacentDifference.hpp"
+#include <gtest/gtest.h>
 
-// contains exclusive_scan, transform_exclusive_scan
-#include "./numeric/Kokkos_ExclusiveScan.hpp"
-
-// contains inclusive_scan, transform_inclusive_scan
-#include "./numeric/Kokkos_InclusiveScan.hpp"
-
-#include "./numeric/Kokkos_Reduce.hpp"
-#include "./numeric/Kokkos_TransformReduce.hpp"
+#define TEST_CATEGORY hip_managed
+#define TEST_CATEGORY_DEATH hip_managed_DeathTest
+#define TEST_EXECSPACE Kokkos::Experimental::HIPManagedSpace
 
 #endif
diff --git a/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
index 0a9fe5a08..a0a34f5c9 100644
--- a/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp
@@ -49,6 +49,7 @@
 
 #define TEST_CATEGORY hip
 #define TEST_CATEGORY_NUMBER 6
+#define TEST_CATEGORY_DEATH hip_DeathTest
 #define TEST_EXECSPACE Kokkos::Experimental::HIP
 
 #endif
diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
index 0287829fd..4c1996bfa 100644
--- a/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
@@ -48,6 +48,7 @@
 #include <gtest/gtest.h>
 
 #define TEST_CATEGORY sycl_host_usm
+#define TEST_CATEGORY_DEATH sycl_host_usm_DeathTest
 #define TEST_EXECSPACE Kokkos::Experimental::SYCLHostUSMSpace
 
 #endif
diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
index 1ec89fc61..442155918 100644
--- a/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
@@ -48,6 +48,7 @@
 #include <gtest/gtest.h>
 
 #define TEST_CATEGORY sycl_shared_usm
+#define TEST_CATEGORY_DEATH sycl_shared_usm_DeathTest
 #define TEST_EXECSPACE Kokkos::Experimental::SYCLSharedUSMSpace
 
 #endif
diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp
index 345f40d1c..efa84ad39 100644
--- a/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp
+++ b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp
@@ -49,6 +49,7 @@
 
 #define TEST_CATEGORY sycl
 #define TEST_CATEGORY_NUMBER 7
+#define TEST_CATEGORY_DEATH sycl_DeathTest
 #define TEST_EXECSPACE Kokkos::Experimental::SYCL
 
 #endif
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
index d09d4edfd..31fd63f08 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
@@ -61,8 +61,7 @@ __global__ void offset(int* p) {
 TEST(cuda, raw_cuda_interop) {
   int* p;
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  Kokkos::initialize();
 
   Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, 100);
   Kokkos::deep_copy(v, 5);
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
index 13388b4c5..f11f657e0 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
@@ -50,8 +50,7 @@ namespace Test {
 TEST(cuda, raw_cuda_streams) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  Kokkos::initialize();
   int* p;
   cudaMalloc(&p, sizeof(int) * 100);
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_ReducerViewSizeLimit.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_ReducerViewSizeLimit.cpp
index 3d7498b11..b8ae97f58 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_ReducerViewSizeLimit.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_ReducerViewSizeLimit.cpp
@@ -77,14 +77,6 @@ struct ArrayReduceFunctor {
     }
   }
 
-  KOKKOS_INLINE_FUNCTION void join(volatile value_type update,
-                                   const volatile value_type source) const {
-    const int numVecs = value_count;
-    for (int j = 0; j < numVecs; ++j) {
-      update[j] += source[j];
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION void join(value_type update,
                                    const value_type source) const {
     const int numVecs = value_count;
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp
index 46556a201..c0d6a8afe 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp
@@ -67,9 +67,9 @@ struct TestViewAPI<
       Kokkos::MemoryTraits<0>;  // maybe we want to add that later to the matrix
   using view_type =
       Kokkos::View<data_type, layout_type, space_type, traits_type>;
-  using alloc_layout_type = typename std::conditional<
-      std::is_same<layout_type, Kokkos::LayoutStride>::value,
-      Kokkos::LayoutLeft, layout_type>::type;
+  using alloc_layout_type =
+      std::conditional_t<std::is_same<layout_type, Kokkos::LayoutStride>::value,
+                         Kokkos::LayoutLeft, layout_type>;
   using d_alloc_type = Kokkos::View<data_type, alloc_layout_type, space_type>;
   using h_alloc_type = typename Kokkos::View<data_type, alloc_layout_type,
                                              space_type>::HostMirror;
diff --git a/packages/kokkos/core/unit_test/headers_self_contained/tstHeader.cpp b/packages/kokkos/core/unit_test/headers_self_contained/tstHeader.cpp
index d488f0fa3..9f249045b 100644
--- a/packages/kokkos/core/unit_test/headers_self_contained/tstHeader.cpp
+++ b/packages/kokkos/core/unit_test/headers_self_contained/tstHeader.cpp
@@ -4,6 +4,8 @@
 #define KOKKOS_HEADER_TO_TEST \
   KOKKOS_HEADER_TEST_STRINGIZE(KOKKOS_HEADER_TEST_NAME)
 
+#define KOKKOS_IMPL_PUBLIC_INCLUDE
+
 // include header twice to see if the include guards are set correctly
 #include KOKKOS_HEADER_TO_TEST
 #include KOKKOS_HEADER_TO_TEST
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
index 73d08abca..af20e753d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
@@ -61,8 +61,7 @@ __global__ void offset(int* p) {
 TEST(hip, raw_hip_interop) {
   int* p;
   KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  Kokkos::initialize();
 
   Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, 100);
   Kokkos::deep_copy(v, 5);
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
index 69ca62df6..95d102d4d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
@@ -52,8 +52,7 @@ namespace Test {
 TEST(hip, raw_hip_streams) {
   hipStream_t stream;
   KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  Kokkos::initialize();
   int* p;
   KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp
new file mode 100644
index 000000000..24f48c659
--- /dev/null
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp
@@ -0,0 +1,86 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestHIP_Category.hpp>
+
+namespace {
+
+template <class HIPMemoryContainer>
+bool checkMemoryCoarseGrainedness(HIPMemoryContainer const& container) {
+  auto size           = container.size();
+  auto allocationSize = HIPMemoryContainer::required_allocation_size(size);
+  hipMemRangeCoherencyMode memInfo;
+
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemRangeGetAttribute(
+      &memInfo, sizeof(hipMemRangeCoherencyMode),
+      hipMemRangeAttributeCoherencyMode, container.data(), allocationSize));
+
+  return (hipMemRangeCoherencyModeCoarseGrain == memInfo);
+}
+
+#define KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(MEMORY_SPACE, DATATYPE, SIZE)    \
+  {                                                                           \
+    Kokkos::View<DATATYPE*, MEMORY_SPACE> view(#MEMORY_SPACE, SIZE);          \
+    ASSERT_TRUE(view.is_allocated())                                          \
+        << "View in " << #MEMORY_SPACE << " with size " << SIZE               \
+        << " was not allocated. This prevents checks of the grainedness.";    \
+    ASSERT_TRUE(checkMemoryCoarseGrainedness(view))                           \
+        << "The memory in views in " << #MEMORY_SPACE                         \
+        << " is not coarse-grained. Kokkos relies on all user facing memory " \
+           "being coarse-grained.";                                           \
+  }
+
+TEST(hip, memory_requirements) {
+  // we want all user-facing memory in hip to be coarse grained. As of
+  // today(07.01.22) the documentation is not reliable/correct, we test the
+  // memory on the device and host
+  KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::Experimental::HIPSpace, int, 10);
+  KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::Experimental::HIPHostPinnedSpace,
+                                       int, 10);
+  KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::Experimental::HIPManagedSpace,
+                                       int, 10);
+}
+}  // namespace
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
index b759d6f40..b44bc46fb 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp
@@ -60,8 +60,11 @@ __global__ void start_intra_block_scan()
   __syncthreads();
 
   DummyFunctor f;
-  Kokkos::Impl::hip_intra_block_reduce_scan<true, DummyFunctor, void>(f,
-                                                                      values);
+  typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN,
+      Kokkos::RangePolicy<Kokkos::Experimental::HIP>, DummyFunctor>::Reducer
+      reducer(&f);
+  Kokkos::Impl::hip_intra_block_reduce_scan<true>(reducer, values);
 
   __syncthreads();
   if (values[i] != ((i + 2) * (i + 1)) / 2) {
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
index d20ea877e..c9b370ea7 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
@@ -75,6 +75,16 @@ TEST(hip, space_access) {
           Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>::accessible,
       "");
 
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<
+          Kokkos::HostSpace, Kokkos::Experimental::HIPManagedSpace>::assignable,
+      "");
+
+  static_assert(
+      Kokkos::Impl::MemorySpaceAccess<
+          Kokkos::HostSpace, Kokkos::Experimental::HIPManagedSpace>::accessible,
+      "");
+
   //--------------------------------------
 
   static_assert(Kokkos::Impl::MemorySpaceAccess<
@@ -100,6 +110,16 @@ TEST(hip, space_access) {
                                                  Kokkos::HostSpace>::accessible,
                 "");
 
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPSpace,
+                    Kokkos::Experimental::HIPManagedSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPSpace,
+                    Kokkos::Experimental::HIPManagedSpace>::accessible,
+                "");
+
   //--------------------------------------
 
   static_assert(Kokkos::Impl::MemorySpaceAccess<
@@ -127,6 +147,53 @@ TEST(hip, space_access) {
                     Kokkos::Experimental::HIPSpace>::accessible,
                 "");
 
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPHostPinnedSpace,
+                    Kokkos::Experimental::HIPManagedSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPHostPinnedSpace,
+                    Kokkos::Experimental::HIPManagedSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPManagedSpace,
+                    Kokkos::Experimental::HIPManagedSpace>::assignable,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HIPManagedSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HIPManagedSpace,
+                                       Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPManagedSpace,
+                    Kokkos::Experimental::HIPSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPManagedSpace,
+                    Kokkos::Experimental::HIPSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPManagedSpace,
+                    Kokkos::Experimental::HIPHostPinnedSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::HIPManagedSpace,
+                    Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
+                "");
+
   //--------------------------------------
 
   static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::HIP,
@@ -143,6 +210,11 @@ TEST(hip, space_access) {
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
 
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::HIP,
+                    Kokkos::Experimental::HIPManagedSpace>::accessible,
+                "");
+
   static_assert(
       !Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                   Kokkos::Experimental::HIPSpace>::accessible,
@@ -153,6 +225,11 @@ TEST(hip, space_access) {
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
 
+  static_assert(
+      Kokkos::SpaceAccessibility<
+          Kokkos::HostSpace, Kokkos::Experimental::HIPManagedSpace>::accessible,
+      "");
+
   static_assert(
       std::is_same<
           Kokkos::Impl::HostMirror<Kokkos::Experimental::HIPSpace>::Space,
@@ -165,6 +242,14 @@ TEST(hip, space_access) {
                    Kokkos::Experimental::HIPHostPinnedSpace>::value,
       "");
 
+  static_assert(
+      std::is_same<
+          Kokkos::Impl::HostMirror<
+              Kokkos::Experimental::HIPManagedSpace>::Space,
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::HIPManagedSpace>>::value,
+      "");
+
   static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::Experimental::HIP>::Space,
                     Kokkos::HostSpace>::accessible,
@@ -181,6 +266,12 @@ TEST(hip, space_access) {
                         Kokkos::Experimental::HIPHostPinnedSpace>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::HIPManagedSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
 }
 
 template <class MemSpace, class ExecSpace>
@@ -227,6 +318,11 @@ TEST(hip, impl_view_accessible) {
                         Kokkos::Experimental::HIP>::run();
   TestViewHIPAccessible<Kokkos::Experimental::HIPHostPinnedSpace,
                         Kokkos::HostSpace::execution_space>::run();
+
+  TestViewHIPAccessible<Kokkos::Experimental::HIPManagedSpace,
+                        Kokkos::HostSpace::execution_space>::run();
+  TestViewHIPAccessible<Kokkos::Experimental::HIPManagedSpace,
+                        Kokkos::Experimental::HIP>::run();
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
index 722614464..421e3c50b 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp
@@ -51,9 +51,8 @@
 #ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 #ifndef HPX_COMPUTE_DEVICE_CODE
 
-namespace Test {
-
 namespace {
+
 struct FunctorInitConstant {
   Kokkos::View<int *, Kokkos::Experimental::HPX> a;
   int c;
@@ -107,82 +106,75 @@ struct FunctorReduce {
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, int &lsum) const { lsum += a(i); }
 };
-}  // namespace
 
 TEST(hpx, independent_instances) {
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
-
   const int n = 100;
   const int c = 1;
   const int d = 3;
 
-  {
-    Kokkos::View<int *, Kokkos::Experimental::HPX> v1("v1", n);
-    Kokkos::View<int *, Kokkos::Experimental::HPX> v2("v2", n);
-    Kokkos::View<int *, Kokkos::Experimental::HPX> v3("v3", n);
-    Kokkos::View<int *, Kokkos::Experimental::HPX> v4("v4", n);
-    Kokkos::View<int, Kokkos::Experimental::HPX> sum_v("sum_v");
-
-    Kokkos::Experimental::HPX hpx1(
-        Kokkos::Experimental::HPX::instance_mode::independent);
-    Kokkos::parallel_for(
-        "Test::hpx::independent_instances::init",
-        Kokkos::Experimental::require(
-            Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx1, 0, n),
-            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-        FunctorInitConstant(v1, c));
-
-    Kokkos::Experimental::HPX hpx2(hpx1.impl_get_future());
-    Kokkos::parallel_for(
-        "Test::hpx::independent_instances::add",
-        Kokkos::Experimental::require(
-            Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx2, 0, n),
-            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-        FunctorAdd(v1, v2, d));
-
-    Kokkos::Experimental::HPX hpx3(hpx1.impl_get_future());
-    Kokkos::parallel_for(
-        "Test::hpx::independent_instances::add_index",
-        Kokkos::Experimental::require(
-            Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx3, 0, n),
-            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-        FunctorAddIndex(v1, v3));
-
-    // NOTE: This monstrosity is used to collapse a future<tuple<future<void>,
-    // future<void>>> (return type of when_all) into a future<void> which is
-    // ready whenever the un-collapsed future would've been ready. HPX does not
-    // currently have the functionality to collapse this automatically.
-    Kokkos::Experimental::HPX hpx4(hpx::get<0>(hpx::split_future(
-        hpx::when_all(hpx2.impl_get_future(), hpx3.impl_get_future()))));
-    Kokkos::parallel_for(
-        "Test::hpx::independent_instances::pointwise_sum",
-        Kokkos::Experimental::require(
-            Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx4, 0, n),
-            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-        FunctorPointwiseSum(v2, v3, v4));
-
-    Kokkos::parallel_reduce(
-        "Test::hpx::independent_instances::reduce",
-        Kokkos::Experimental::require(
-            Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx4, 0, n),
-            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-        FunctorReduce(v4), Kokkos::Sum<int>(sum_v));
-
-    hpx4.fence();
-
-    ASSERT_EQ(true, hpx1.impl_get_future().is_ready());
-    ASSERT_EQ(true, hpx2.impl_get_future().is_ready());
-    ASSERT_EQ(true, hpx3.impl_get_future().is_ready());
-    ASSERT_EQ(true, hpx4.impl_get_future().is_ready());
-
-    const int expected_sum = n * (2 * c + d) + (n * (n - 1) / 2);
-    ASSERT_EQ(expected_sum, sum_v());
-  }
-
-  Kokkos::finalize();
+  Kokkos::View<int *, Kokkos::Experimental::HPX> v1("v1", n);
+  Kokkos::View<int *, Kokkos::Experimental::HPX> v2("v2", n);
+  Kokkos::View<int *, Kokkos::Experimental::HPX> v3("v3", n);
+  Kokkos::View<int *, Kokkos::Experimental::HPX> v4("v4", n);
+  Kokkos::View<int, Kokkos::Experimental::HPX> sum_v("sum_v");
+
+  Kokkos::Experimental::HPX hpx1(
+      Kokkos::Experimental::HPX::instance_mode::independent);
+  Kokkos::parallel_for(
+      "Test::hpx::independent_instances::init",
+      Kokkos::Experimental::require(
+          Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx1, 0, n),
+          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+      FunctorInitConstant(v1, c));
+
+  Kokkos::Experimental::HPX hpx2(hpx1.impl_get_future());
+  Kokkos::parallel_for(
+      "Test::hpx::independent_instances::add",
+      Kokkos::Experimental::require(
+          Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx2, 0, n),
+          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+      FunctorAdd(v1, v2, d));
+
+  Kokkos::Experimental::HPX hpx3(hpx1.impl_get_future());
+  Kokkos::parallel_for(
+      "Test::hpx::independent_instances::add_index",
+      Kokkos::Experimental::require(
+          Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx3, 0, n),
+          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+      FunctorAddIndex(v1, v3));
+
+  // NOTE: This monstrosity is used to collapse a future<tuple<future<void>,
+  // future<void>>> (return type of when_all) into a future<void> which is
+  // ready whenever the un-collapsed future would've been ready. HPX does not
+  // currently have the functionality to collapse this automatically.
+  Kokkos::Experimental::HPX hpx4(hpx::get<0>(hpx::split_future(
+      hpx::when_all(hpx2.impl_get_future(), hpx3.impl_get_future()))));
+  Kokkos::parallel_for(
+      "Test::hpx::independent_instances::pointwise_sum",
+      Kokkos::Experimental::require(
+          Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx4, 0, n),
+          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+      FunctorPointwiseSum(v2, v3, v4));
+
+  Kokkos::parallel_reduce(
+      "Test::hpx::independent_instances::reduce",
+      Kokkos::Experimental::require(
+          Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx4, 0, n),
+          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+      FunctorReduce(v4), Kokkos::Sum<int>(sum_v));
+
+  hpx4.fence();
+
+  ASSERT_EQ(true, hpx1.impl_get_future().is_ready());
+  ASSERT_EQ(true, hpx2.impl_get_future().is_ready());
+  ASSERT_EQ(true, hpx3.impl_get_future().is_ready());
+  ASSERT_EQ(true, hpx4.impl_get_future().is_ready());
+
+  const int expected_sum = n * (2 * c + d) + (n * (n - 1) / 2);
+  ASSERT_EQ(expected_sum, sum_v());
 }
-}  // namespace Test
+
+}  // namespace
 
 #endif
 #endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
index ae0d8b5ab..872bf2fd5 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp
@@ -49,36 +49,30 @@
 
 #ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 
-namespace Test {
+namespace {
 
-TEST(hpx, delayed_execution) {
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+TEST(hpx, independent_instances_delayed_execution) {
+  Kokkos::View<bool, Kokkos::Experimental::HPX> ran("ran");
+  hpx::lcos::local::promise<void> p;
+  hpx::shared_future<void> f = p.get_future();
 
-  {
-    Kokkos::View<bool, Kokkos::Experimental::HPX> ran("ran");
-    hpx::lcos::local::promise<void> p;
-    hpx::shared_future<void> f = p.get_future();
+  Kokkos::Experimental::HPX hpx(f);
+  Kokkos::parallel_for(
+      "Test::hpx::independent_instances::delay_execution",
+      Kokkos::Experimental::require(
+          Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1),
+          Kokkos::Experimental::WorkItemProperty::HintLightWeight),
+      KOKKOS_LAMBDA(int) { ran() = true; });
 
-    Kokkos::Experimental::HPX hpx(f);
-    Kokkos::parallel_for(
-        "Test::hpx::independent_instances::delay_execution",
-        Kokkos::Experimental::require(
-            Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1),
-            Kokkos::Experimental::WorkItemProperty::HintLightWeight),
-        KOKKOS_LAMBDA(int) { ran() = true; });
+  ASSERT_FALSE(ran());
+  ASSERT_FALSE(hpx.impl_get_future().is_ready());
 
-    ASSERT_EQ(false, ran());
-    ASSERT_EQ(false, hpx.impl_get_future().is_ready());
+  p.set_value();
 
-    p.set_value();
-
-    hpx.fence();
-    ASSERT_EQ(true, hpx.impl_get_future().is_ready());
-  }
-
-  Kokkos::finalize();
+  hpx.fence();
+  ASSERT_TRUE(hpx.impl_get_future().is_ready());
 }
-}  // namespace Test
+
+}  // namespace
 
 #endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
index 300cb1111..899cd09d4 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp
@@ -49,69 +49,63 @@
 
 #ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 
-namespace Test {
+namespace {
 
-TEST(hpx, instance_ids) {
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+TEST(hpx, independent_instances_instance_ids) {
+  Kokkos::Experimental::HPX hpx_default1;
+  Kokkos::Experimental::HPX hpx_default2 = hpx_default1;
+  Kokkos::Experimental::HPX hpx_default3{hpx_default1};
+  Kokkos::Experimental::HPX hpx_default4(
+      Kokkos::Experimental::HPX::instance_mode::default_);
+  Kokkos::Experimental::HPX hpx_default5;
+  hpx_default5 = hpx_default1;
 
-  {
-    Kokkos::Experimental::HPX hpx_default1;
-    Kokkos::Experimental::HPX hpx_default2 = hpx_default1;
-    Kokkos::Experimental::HPX hpx_default3{hpx_default1};
-    Kokkos::Experimental::HPX hpx_default4(
-        Kokkos::Experimental::HPX::instance_mode::default_);
-    Kokkos::Experimental::HPX hpx_default5;
-    hpx_default5 = hpx_default1;
+  ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
+            hpx_default1.impl_instance_id());
+  ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
+            hpx_default2.impl_instance_id());
+  ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
+            hpx_default3.impl_instance_id());
+  ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
+            hpx_default4.impl_instance_id());
+  ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
+            hpx_default5.impl_instance_id());
 
-    ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
-              hpx_default1.impl_instance_id());
-    ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
-              hpx_default2.impl_instance_id());
-    ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
-              hpx_default3.impl_instance_id());
-    ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
-              hpx_default4.impl_instance_id());
-    ASSERT_EQ(Kokkos::Experimental::HPX::impl_default_instance_id(),
-              hpx_default5.impl_instance_id());
+  Kokkos::Experimental::HPX hpx_independent1(
+      Kokkos::Experimental::HPX::instance_mode::independent);
+  Kokkos::Experimental::HPX hpx_independent2 = hpx_independent1;
+  Kokkos::Experimental::HPX hpx_independent3{hpx_independent1};
+  Kokkos::Experimental::HPX hpx_independent4;
+  hpx_independent4 = hpx_independent1;
 
-    Kokkos::Experimental::HPX hpx_independent1(
-        Kokkos::Experimental::HPX::instance_mode::independent);
-    Kokkos::Experimental::HPX hpx_independent2 = hpx_independent1;
-    Kokkos::Experimental::HPX hpx_independent3{hpx_independent1};
-    Kokkos::Experimental::HPX hpx_independent4;
-    hpx_independent4 = hpx_independent1;
+  ASSERT_NE(hpx_default1.impl_instance_id(),
+            hpx_independent1.impl_instance_id());
+  ASSERT_EQ(hpx_independent1.impl_instance_id(),
+            hpx_independent2.impl_instance_id());
+  ASSERT_EQ(hpx_independent1.impl_instance_id(),
+            hpx_independent3.impl_instance_id());
+  ASSERT_EQ(hpx_independent1.impl_instance_id(),
+            hpx_independent4.impl_instance_id());
 
-    ASSERT_NE(hpx_default1.impl_instance_id(),
-              hpx_independent1.impl_instance_id());
-    ASSERT_EQ(hpx_independent1.impl_instance_id(),
-              hpx_independent2.impl_instance_id());
-    ASSERT_EQ(hpx_independent1.impl_instance_id(),
-              hpx_independent3.impl_instance_id());
-    ASSERT_EQ(hpx_independent1.impl_instance_id(),
-              hpx_independent4.impl_instance_id());
+  hpx::shared_future<void> f = hpx::make_ready_future<void>();
+  Kokkos::Experimental::HPX hpx_independent_future1(f);
+  Kokkos::Experimental::HPX hpx_independent_future2 = hpx_independent_future1;
+  Kokkos::Experimental::HPX hpx_independent_future3{hpx_independent_future1};
+  Kokkos::Experimental::HPX hpx_independent_future4;
+  hpx_independent_future4 = hpx_independent_future1;
 
-    hpx::shared_future<void> f = hpx::make_ready_future<void>();
-    Kokkos::Experimental::HPX hpx_independent_future1(f);
-    Kokkos::Experimental::HPX hpx_independent_future2 = hpx_independent_future1;
-    Kokkos::Experimental::HPX hpx_independent_future3{hpx_independent_future1};
-    Kokkos::Experimental::HPX hpx_independent_future4;
-    hpx_independent_future4 = hpx_independent_future1;
-
-    ASSERT_NE(hpx_default1.impl_instance_id(),
-              hpx_independent1.impl_instance_id());
-    ASSERT_NE(hpx_independent1.impl_instance_id(),
-              hpx_independent_future1.impl_instance_id());
-    ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
-              hpx_independent_future2.impl_instance_id());
-    ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
-              hpx_independent_future3.impl_instance_id());
-    ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
-              hpx_independent_future4.impl_instance_id());
-  }
-
-  Kokkos::finalize();
+  ASSERT_NE(hpx_default1.impl_instance_id(),
+            hpx_independent1.impl_instance_id());
+  ASSERT_NE(hpx_independent1.impl_instance_id(),
+            hpx_independent_future1.impl_instance_id());
+  ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
+            hpx_independent_future2.impl_instance_id());
+  ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
+            hpx_independent_future3.impl_instance_id());
+  ASSERT_EQ(hpx_independent_future1.impl_instance_id(),
+            hpx_independent_future4.impl_instance_id());
 }
-}  // namespace Test
+
+}  // namespace
 
 #endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
index a98c8b0d6..a69bea572 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp
@@ -47,7 +47,6 @@
 
 #ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 
-namespace Test {
 namespace {
 std::atomic<int> dummy_count;
 
@@ -57,39 +56,32 @@ struct dummy {
   ~dummy() { --dummy_count; }
   void f() const {}
 };
-}  // namespace
+
 // This test makes sure the independent HPX instances don't hold on to captured
 // data after destruction.
-TEST(hpx, reference_counting) {
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
-
-  {
-    dummy d;
-    Kokkos::Experimental::HPX hpx(
-        Kokkos::Experimental::HPX::instance_mode::independent);
-    Kokkos::parallel_for(
-        "Test::hpx::reference_counting::dummy",
-        Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1),
-        KOKKOS_LAMBDA(int) {
-          // Make sure dummy struct is captured.
-          d.f();
-        });
+TEST(hpx, independent_instances_reference_counting) {
+  dummy d;
+  Kokkos::Experimental::HPX hpx(
+      Kokkos::Experimental::HPX::instance_mode::independent);
+  Kokkos::parallel_for(
+      "Test::hpx::reference_counting::dummy",
+      Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1),
+      KOKKOS_LAMBDA(int) {
+        // Make sure dummy struct is captured.
+        d.f();
+      });
 
-    // This attaches a continuation and releases the d captured above from the
-    // shared state of the internal future.
-    Kokkos::parallel_for(
-        "Test::hpx::reference_counting::dummy_clear",
-        Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1),
-        KOKKOS_LAMBDA(int){});
+  hpx.fence();
 
-    hpx.fence();
+  // The fence above makes sure that copies of dummy get released. However,
+  // all copies are not guaranteed to be released as soon as fence returns.
+  // Therefore we wait for a short time to make it almost guaranteed that all
+  // copies have been released.
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
-    ASSERT_EQ(1, dummy_count);
-  }
-
-  Kokkos::finalize();
+  ASSERT_EQ(1, dummy_count);
 }
-}  // namespace Test
+
+}  // namespace
 
 #endif
diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp
index 31c35ac9a..e89f7acac 100644
--- a/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp
+++ b/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp
@@ -48,10 +48,10 @@
 namespace Test {
 
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
-// Cuda.
+// HPX.
 TEST(hpx, raw_hpx_interop) {
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  // FIXME_HPX
+  Kokkos::initialize();
   Kokkos::finalize();
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
index f157af4f2..c14a90d74 100644
--- a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
@@ -61,10 +61,10 @@ template <class ExecSpace>
 struct TestIncrExecSpaceTypedef {
   void testit() {
     const bool passed =
-        (!std::is_same<void, typename ExecSpace::memory_space>::value) &&
+        (!std::is_void<typename ExecSpace::memory_space>::value) &&
         std::is_same<ExecSpace, typename ExecSpace::execution_space>::value &&
-        !std::is_same<void, typename ExecSpace::scratch_memory_space>::value &&
-        !std::is_same<void, typename ExecSpace::array_layout>::value;
+        !std::is_void<typename ExecSpace::scratch_memory_space>::value &&
+        !std::is_void<typename ExecSpace::array_layout>::value;
     static_assert(passed == true,
                   "The memory and execution spaces are defined");
   }
diff --git a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
index 7d53b9fb2..649cf6c38 100644
--- a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
@@ -71,12 +71,6 @@ struct MyComplex {
     _re += src._re;
     _im += src._im;
   }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator+=(const volatile MyComplex& src) volatile {
-    _re += src._re;
-    _im += src._im;
-  }
 };
 
 template <class ExecSpace>
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp
index ce8ee40d4..1039f13fe 100644
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp
@@ -74,7 +74,6 @@
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
-#include <TestTemplateMetaFunctions.hpp>
 #include <TestPolicyConstruction.hpp>
 #include <TestMDRange.hpp>
 #include <TestConcurrentBitset.hpp>
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
index c3ee67673..5f8fd2236 100644
--- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
@@ -62,8 +62,7 @@ TEST(openmp, raw_openmp_interop) {
 
   ASSERT_EQ(count, num_threads);
 
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  Kokkos::initialize();
 
   count = 0;
 #pragma omp parallel
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp
index edc1c24dd..3d8c722be 100644
--- a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp
@@ -76,7 +76,6 @@
 //#include <TestCXX11.hpp>
 //#include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
-//#include <TestTemplateMetaFunctions.hpp>
 //#include <TestPolicyConstruction.hpp>
 //#include <TestMDRange.hpp>
 
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
index d145d69d9..e45d99074 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
@@ -52,11 +52,10 @@ namespace Test {
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
 // SYCL.
 TEST(sycl, raw_sycl_interop) {
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  Kokkos::initialize();
 
   Kokkos::Experimental::SYCL default_space;
-  sycl::context default_context = default_space.sycl_context();
+  sycl::context default_context = default_space.sycl_queue().get_context();
 
   sycl::default_selector device_selector;
   sycl::queue queue(default_context, device_selector);
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
index c12c5c072..114d2a4aa 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
@@ -52,7 +52,7 @@ namespace Test {
 // Test whether external allocations can be accessed by the default queue.
 TEST(sycl, raw_sycl_interop_context_1) {
   Kokkos::Experimental::SYCL default_space;
-  sycl::context default_context = default_space.sycl_context();
+  sycl::context default_context = default_space.sycl_queue().get_context();
 
   sycl::default_selector device_selector;
   sycl::queue queue(default_context, device_selector);
@@ -86,7 +86,7 @@ TEST(sycl, raw_sycl_interop_context_1) {
 // Test whether regular View allocations can be accessed by non-default queues.
 TEST(sycl, raw_sycl_interop_context_2) {
   Kokkos::Experimental::SYCL default_space;
-  sycl::context default_context = default_space.sycl_context();
+  sycl::context default_context = default_space.sycl_queue().get_context();
 
   sycl::default_selector device_selector;
   sycl::queue queue(default_context, device_selector);
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
index 40a88a6ca..8ffada1da 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
@@ -49,12 +49,8 @@ namespace Test {
 // Test Interoperability with SYCL Streams
 TEST(sycl, raw_sycl_queues) {
   sycl::default_selector device_selector;
-  // FIXME_SYCL using an in-order queue here should not be necessary since we
-  // are using submit_barrier for managing kernel dependencies but this seems to
-  // be required as a hot fix for now.
-  sycl::queue queue(device_selector, sycl::property::queue::in_order());
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
+  sycl::queue queue(device_selector);
+  Kokkos::initialize();
   int* p            = sycl::malloc_device<int>(100, queue);
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
 
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
index ab0d09880..420522caf 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
@@ -102,7 +102,7 @@ void sycl_queue_scratch_test(
     Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter) {
   constexpr int K = 4;
   Kokkos::Experimental::SYCL default_space;
-  sycl::context default_context = default_space.sycl_context();
+  sycl::context default_context = default_space.sycl_queue().get_context();
 
   sycl::default_selector device_selector;
   sycl::queue queue(default_context, device_selector);
diff --git a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp
index df250fe0d..2fd43558f 100644
--- a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp
+++ b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp
@@ -189,11 +189,14 @@ TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); }
 TEST(defaultdevicetype, access_allowed) {
   test_allowed_access<fake_memory_space>();
 }
+// FIXME_SYCL
+#if !(defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_SYCL))
 TEST(defaultdevicetype_DeathTest, access_forbidden) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
   ASSERT_DEATH(
       { test_allowed_access<semantically_independent_logical_space>(); },
       "Kokkos::View ERROR: attempt to access inaccessible memory space");
 }
+#endif
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/tools/TestWithoutInitializing.cpp b/packages/kokkos/core/unit_test/tools/TestWithoutInitializing.cpp
index c0a695d72..8d8109894 100644
--- a/packages/kokkos/core/unit_test/tools/TestWithoutInitializing.cpp
+++ b/packages/kokkos/core/unit_test/tools/TestWithoutInitializing.cpp
@@ -75,3 +75,79 @@ TEST(kokkosp, create_mirror_no_init) {
       });
   ASSERT_TRUE(success);
 }
+
+TEST(kokkosp, create_mirror_no_init_view_ctor) {
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
+  Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
+                                                                10);
+  Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::HostSpace{},
+                               Kokkos::WithoutInitializing),
+            device_view);
+        auto mirror_host = Kokkos::create_mirror(
+            Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+        auto mirror_device_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::HostSpace{},
+                               Kokkos::WithoutInitializing),
+            device_view);
+        auto mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing,
+                               Kokkos::DefaultExecutionSpace{}),
+            host_view);
+        mirror_host_view = Kokkos::create_mirror_view(
+            Kokkos::view_alloc(Kokkos::WithoutInitializing), host_view);
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found begin event"}};
+      },
+      [&](EndParallelForEvent) {
+        return MatchDiagnostic{true, {"Found end event"}};
+      });
+  ASSERT_TRUE(success);
+}
+
+TEST(kokkosp, create_mirror_view_and_copy) {
+#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET
+  if (std::is_same<Kokkos::DefaultExecutionSpace,
+                   Kokkos::Experimental::OpenMPTarget>::value)
+    GTEST_SKIP() << "skipping since the OpenMPTarget has unexpected fences";
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<Kokkos::DefaultExecutionSpace::memory_space,
+                   Kokkos::CudaUVMSpace>::value)
+    GTEST_SKIP()
+        << "skipping since the CudaUVMSpace requires additional fences";
+#endif
+
+  using namespace Kokkos::Test::Tools;
+  listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
+                     Config::EnableFences());
+  Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view;
+  Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
+
+  auto success = validate_absence(
+      [&]() {
+        auto mirror_device = Kokkos::create_mirror_view_and_copy(
+            Kokkos::view_alloc(
+                Kokkos::DefaultExecutionSpace{},
+                typename Kokkos::DefaultExecutionSpace::memory_space{}),
+            host_view);
+        // Avoid fences for deallocation when mirror_device goes out of scope.
+        device_view = mirror_device;
+      },
+      [&](BeginParallelForEvent) {
+        return MatchDiagnostic{true, {"Found parallel_for event"}};
+      },
+      [&](BeginFenceEvent) {
+        return MatchDiagnostic{true, {"Found fence event"}};
+      });
+  ASSERT_TRUE(success);
+}
diff --git a/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp b/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp
index e5a03f7fb..10dbea579 100644
--- a/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp
+++ b/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp
@@ -1300,6 +1300,24 @@ bool validate_absence(const Lambda& lam, const Matchers... matchers) {
   return true;
 }
 
+template <class Lambda, class Matcher>
+bool validate_existence(const Lambda& lam, const Matcher matcher) {
+  // First, erase events from previous invocations
+  found_events.clear();
+  // Invoke the lambda (this will populate found_events, via tooling)
+  lam();
+  // compare the found events against the expected ones
+  for (const auto& event : found_events) {
+    MatchDiagnostic match = check_presence_of(event, matcher);
+
+    if (match.success) return true;
+  }
+  std::cout << "Test failure: Didn't encounter wanted events" << std::endl;
+  for (const auto& p_event : found_events)
+    std::cout << p_event->descriptor() << std::endl;
+  return false;
+}
+
 }  // namespace Tools
 }  // namespace Test
 }  // namespace Kokkos
diff --git a/packages/kokkos/example/build_cmake_installed/CMakeLists.txt b/packages/kokkos/example/build_cmake_installed/CMakeLists.txt
index 48d2cff51..780f7e6ac 100644
--- a/packages/kokkos/example/build_cmake_installed/CMakeLists.txt
+++ b/packages/kokkos/example/build_cmake_installed/CMakeLists.txt
@@ -6,12 +6,6 @@ cmake_minimum_required(VERSION 3.16)
 # Kokkos flags will only apply to C++ files
 project(Example CXX Fortran)
 
-# You need this for using Kokkos_ROOT variable
-if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.12.0")
-  message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
-  cmake_policy(SET CMP0074 NEW)
-endif()
-
 # Look for an installed Kokkos
 find_package(Kokkos REQUIRED)
 
diff --git a/packages/kokkos/example/build_cmake_installed/cmake_example.cpp b/packages/kokkos/example/build_cmake_installed/cmake_example.cpp
index fd05172cb..5101526ab 100644
--- a/packages/kokkos/example/build_cmake_installed/cmake_example.cpp
+++ b/packages/kokkos/example/build_cmake_installed/cmake_example.cpp
@@ -55,7 +55,7 @@ struct CountFunctor {
 
 int main(int argc, char* argv[]) {
   Kokkos::initialize(argc, argv);
-  Kokkos::DefaultExecutionSpace::print_configuration(std::cout);
+  Kokkos::DefaultExecutionSpace().print_configuration(std::cout);
 
   if (argc < 2) {
     fprintf(stderr, "Usage: %s [<kokkos_options>] <size>\n", argv[0]);
diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt b/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt
index df16774e7..1647c6ca0 100644
--- a/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt
+++ b/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt
@@ -6,10 +6,6 @@ cmake_minimum_required(VERSION 3.16)
 # Kokkos flags will only apply to C++ files
 project(Example CXX Fortran)
 
-# You need this for using Kokkos_ROOT variable
-message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
-cmake_policy(SET CMP0074 NEW)
-
 # Look for an installed Kokkos but force using the compiler launcher
 # to ensure that targets depending on Kokkos use the same compiler
 # as when kokkos was installed, e.g. if kokkos was built with
diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
index fc10366f7..f78f07c6f 100644
--- a/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
+++ b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp
@@ -53,7 +53,7 @@ struct CountFunctor {
 
 int main(int argc, char* argv[]) {
   Kokkos::initialize(argc, argv);
-  Kokkos::DefaultExecutionSpace::print_configuration(std::cout);
+  Kokkos::DefaultExecutionSpace().print_configuration(std::cout);
 
   if (argc < 2) {
     fprintf(stderr, "Usage: %s [<kokkos_options>] <size>\n", argv[0]);
diff --git a/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp b/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp
index 5a0f93e9d..b9b1c5848 100644
--- a/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp
+++ b/packages/kokkos/example/build_cmake_installed_kk_as_language/cmake_example.cpp
@@ -56,7 +56,7 @@ struct CountEvenIntegers {
 
 int main(int argc, char* argv[]) {
   Kokkos::ScopeGuard guard(argc, argv);
-  Kokkos::DefaultExecutionSpace::print_configuration(std::cout);
+  Kokkos::DefaultExecutionSpace().print_configuration(std::cout);
 
   const long n = argc > 1 ? atoi(argv[1]) : 10;
 
diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
index 5ac7f4fbb..aac3b7eba 100644
--- a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
+++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
@@ -61,13 +61,13 @@
 
 // Simple functor for computing/storing the product of indices in a View v
 template <class ViewType>
-struct MDFunctor {
+struct MDFunctor2D {
   using value_type = long;
 
   ViewType v;
   size_t size;
 
-  MDFunctor(const ViewType& v_, const size_t size_) : v(v_), size(size_) {}
+  MDFunctor2D(const ViewType& v_, const size_t size_) : v(v_), size(size_) {}
 
   // 2D case - used by parallel_for
   KOKKOS_INLINE_FUNCTION
@@ -75,12 +75,6 @@ struct MDFunctor {
     v(i, j) = i * j;  // compute the product of indices
   }
 
-  // 3D case - used by parallel_for
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const int i, const int j, const int k) const {
-    v(i, j, k) = i * j * k;  // compute the product of indices
-  }
-
   // 2D case - reduction
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, const int j, value_type& incorrect_count) const {
@@ -88,6 +82,22 @@ struct MDFunctor {
       incorrect_count += 1;
     }
   }
+};
+
+template <class ViewType>
+struct MDFunctor3D {
+  using value_type = long;
+
+  ViewType v;
+  size_t size;
+
+  MDFunctor3D(const ViewType& v_, const size_t size_) : v(v_), size(size_) {}
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i, const int j, const int k) const {
+    v(i, j, k) = i * j * k;  // compute the product of indices
+  }
 
   // 3D case - reduction
   KOKKOS_INLINE_FUNCTION
@@ -170,11 +180,12 @@ int main(int argc, char* argv[]) {
     ViewType_2D v2("v2", n, n);
 
     // Execute parallel_for with rank 2 MDRangePolicy
-    Kokkos::parallel_for("md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n));
+    Kokkos::parallel_for("md2d", mdpolicy_2d, MDFunctor2D<ViewType_2D>(v2, n));
 
     // Check results with a parallel_reduce using the MDRangePolicy
     Kokkos::parallel_reduce("md2dredux", mdpolicy_2d,
-                            MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d);
+                            MDFunctor2D<ViewType_2D>(v2, n),
+                            incorrect_count_2d);
 
     printf("Rank 2 MDRangePolicy incorrect count: %ld\n",
            incorrect_count_2d);  // should be 0
@@ -194,11 +205,12 @@ int main(int argc, char* argv[]) {
     ViewType_3D v3("v3", n, n, n);
 
     // Execute parallel_for with rank 3 MDRangePolicy
-    Kokkos::parallel_for("md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n));
+    Kokkos::parallel_for("md3d", mdpolicy_3d, MDFunctor3D<ViewType_3D>(v3, n));
 
     // Check results with a parallel_reduce using the MDRangePolicy
     Kokkos::parallel_reduce("md3dredux", mdpolicy_3d,
-                            MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d);
+                            MDFunctor3D<ViewType_3D>(v3, n),
+                            incorrect_count_3d);
 
     printf("Rank 3 MDRangePolicy incorrect count: %ld\n",
            incorrect_count_3d);  // should be 0
diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash
index f86147bb9..3b78301fc 100755
--- a/packages/kokkos/generate_makefile.bash
+++ b/packages/kokkos/generate_makefile.bash
@@ -179,9 +179,18 @@ display_help_text() {
       echo "                 HSW             = Intel Haswell CPUs"
       echo "                 BDW             = Intel Broadwell Xeon E-class CPUs"
       echo "                 SKX             = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)"
+      echo "                 ICX             = Intel Ice Lake CPUs (AVX512)"
       echo "               [Intel Xeon Phi]"
       echo "                 KNC             = Intel Knights Corner Xeon Phi"
       echo "                 KNL             = Intel Knights Landing Xeon Phi"
+      echo "               [Intel: GPU]"
+      echo "                 INTEL_GEN       = SPIR64-based devices, e.g. Intel GPUs, using JIT"
+      echo "                 INTEL_DG1       = Intel Iris XeMAX GPU"
+      echo "                 INTEL_GEN9      = Intel GPU Gen9"
+      echo "                 INTEL_GEN11     = Intel GPU Gen11"
+      echo "                 INTEL_GEN12LP   = Intel GPU Gen12LP"
+      echo "                 INTEL_XEHP      = Intel GPU Xe-HP"
+      echo "                 INTEL_PVC       = Intel GPU Ponte Vecchio"
       echo "               [NVIDIA]"
       echo "                 Kepler30        = NVIDIA Kepler generation CC 3.0"
       echo "                 Kepler32        = NVIDIA Kepler generation CC 3.2"
@@ -194,11 +203,15 @@ display_help_text() {
       echo "                 Pascal61        = NVIDIA Pascal generation CC 6.1"
       echo "                 Volta70         = NVIDIA Volta generation CC 7.0"
       echo "                 Volta72         = NVIDIA Volta generation CC 7.2"
+      echo "                 Ampere80        = NVIDIA Ampere generation CC 8.0"
+      echo "                 Ampere86        = NVIDIA Ampere generation CC 8.6"
       echo ""
       echo "--compiler=/Path/To/Compiler  Set the compiler."
       echo "--debug,-dbg:                 Enable Debugging."
       echo "--boundscheck:                Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds."
       echo "--disable-tests               Disable compilation of unit tests (enabled by default)"
+      echo "--deprecated-code             Enable deprecated code (disabled by default)"
+      echo "--deprecated-code-warnings    Enable deprecated code warnings (disabled by default)"
       echo "--cxxflags=[FLAGS]            Overwrite CXXFLAGS for library build and test"
       echo "                                build.  This will still set certain required"
       echo "                                flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
@@ -239,6 +252,9 @@ WITH_CUDA_BACKEND=OFF
 WITH_HIP_BACKEND=OFF
 WITH_OMPT_BACKEND=OFF
 
+KOKKOS_DEPRECATED_CODE=OFF
+KOKKOS_DEPRECATED_CODE_WARNINGS=OFF
+
 while [[ $# > 0 ]]
 do
   key="$1"
@@ -358,6 +374,12 @@ do
     --disable-tests)
       KOKKOS_DO_TESTS=OFF
       ;;
+    --deprecated-code)
+      KOKKOS_DEPRECATED_CODE=ON
+      ;;
+    --deprecated-code-warnings)
+      KOKKOS_DEPRECATED_CODE_WARNINGS=ON
+      ;;
     --no-examples)
       KOKKOS_DO_EXAMPLES=OFF
       ;;
@@ -483,5 +505,5 @@ if [[ ${COMPILER} == *clang* ]]; then
    fi
 fi
 
-echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
-cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
+echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH}
+cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH}
diff --git a/packages/kokkos/gnu_generate_makefile.bash b/packages/kokkos/gnu_generate_makefile.bash
index 15a095854..aab95e12e 100755
--- a/packages/kokkos/gnu_generate_makefile.bash
+++ b/packages/kokkos/gnu_generate_makefile.bash
@@ -158,9 +158,18 @@ do
       echo "                 HSW             = Intel Haswell CPUs"
       echo "                 BDW             = Intel Broadwell Xeon E-class CPUs"
       echo "                 SKX             = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)"
+      echo "                 ICX             = Intel Ice Lake CPUs (AVX512)"
       echo "               [Intel Xeon Phi]"
       echo "                 KNC             = Intel Knights Corner Xeon Phi"
       echo "                 KNL             = Intel Knights Landing Xeon Phi"
+      echo "               [Intel: GPU]"
+      echo "                 INTEL_GEN       = SPIR64-based devices, e.g. Intel GPUs, using JIT"
+      echo "                 INTEL_DG1       = Intel Iris XeMAX GPU"
+      echo "                 INTEL_GEN9      = Intel GPU Gen9"
+      echo "                 INTEL_GEN11     = Intel GPU Gen11"
+      echo "                 INTEL_GEN12LP   = Intel GPU Gen12LP"
+      echo "                 INTEL_XEHP      = Intel GPU Xe-HP"
+      echo "                 INTEL_PVC       = Intel GPU Ponte Vecchio"
       echo "               [NVIDIA]"
       echo "                 Kepler30        = NVIDIA Kepler generation CC 3.0"
       echo "                 Kepler32        = NVIDIA Kepler generation CC 3.2"
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index 41c755a8a..a1a87ce31 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -28,3 +28,4 @@ tag:  3.4.01     date: 05:20:2021    master: 4b97a22f    release: 410b15c8
 tag:  3.5.00     date: 11:19:2021    master: c28a8b03    release: 21b879e4
 tag:  3.6.00     date: 04:14:2022    master: 2834f94a    release: 6ea708ff
 tag:  3.6.01     date: 06:16:2022    master: b52f8c83    release: afe9b404
+tag:  3.7.00     date: 08:25:2022    master: d19aab99    release: 0018e5fb
diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvhpc b/packages/kokkos/scripts/docker/Dockerfile.nvhpc
index 3e3a32e4d..c0b8cc19d 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.nvhpc
+++ b/packages/kokkos/scripts/docker/Dockerfile.nvhpc
@@ -1,4 +1,4 @@
-ARG BASE=nvcr.io/nvidia/nvhpc:21.9-devel-cuda11.4-ubuntu20.04
+ARG BASE=nvcr.io/nvidia/nvhpc:22.3-devel-cuda11.6-ubuntu20.04
 FROM $BASE
 
 RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
@@ -9,7 +9,7 @@ RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
     gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
     rm ${KEYDUMP_FILE}*
 
-ARG CMAKE_VERSION=3.21.4
+ARG CMAKE_VERSION=3.23.1
 ENV CMAKE_DIR=/opt/cmake
 RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
index caeee1821..e10c3f220 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.openmptarget
+++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
@@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO
     rm ${CMAKE_SCRIPT}
 ENV PATH=${CMAKE_DIR}/bin:$PATH
 
-ARG LLVM_VERSION=llvmorg-13.0.1-rc3
+ARG LLVM_VERSION=llvmorg-14.0.0
 ENV LLVM_DIR=/opt/llvm
 RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\
     LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\
diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl
index 1cd700648..0970d2ac5 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.sycl
+++ b/packages/kokkos/scripts/docker/Dockerfile.sycl
@@ -38,8 +38,8 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO
 ENV PATH=${CMAKE_DIR}/bin:$PATH
 
 ENV SYCL_DIR=/opt/sycl
-RUN SYCL_VERSION=2021-09 && \
-    SYCL_URL=https://github.com/intel/llvm/archive/ && \
+RUN SYCL_VERSION=20220112 && \
+    SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \
     SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
     wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \
diff --git a/packages/kokkos/scripts/testing_scripts/test_all_sandia b/packages/kokkos/scripts/testing_scripts/test_all_sandia
index b2c5afe23..72ee31707 100755
--- a/packages/kokkos/scripts/testing_scripts/test_all_sandia
+++ b/packages/kokkos/scripts/testing_scripts/test_all_sandia
@@ -13,6 +13,8 @@ print_help() {
   echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
   echo "    Defaults to root repo containing this script"
   echo "--debug: Run tests in debug. Defaults to False"
+  echo "--deprecated-code: Enable deprecated code (disabled by default)"
+  echo "--deprecated-code-warnings    Enable deprecated code warnings (disabled by default)"
   echo "--boundscheck: Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds."
   echo "--test-script: Test this script, not Kokkos"
   echo "--skip-hwloc: Do not do hwloc tests"
@@ -167,6 +169,9 @@ CXX_STANDARD="14"
 
 CTESTTIMEOUT=2000
 
+KOKKOS_DEPRECATED_CODE=""
+KOKKOS_DEPRECATED_CODE_WARNINGS=""
+
 #
 # Handle arguments.
 #
@@ -188,6 +193,12 @@ do
     --boundscheck*)
       KOKKOS_BOUNDS_CHECK="--boundscheck"
       ;;
+    --deprecated-code)
+      KOKKOS_DEPRECATED_CODE="--deprecated-code"
+      ;;
+    --deprecated-code-warnings)
+      KOKKOS_DEPRECATED_CODE_WARNINGS="--deprecated-code-warnings"
+      ;;
     --build-only*)
       BUILD_ONLY=True
       ;;
@@ -441,6 +452,8 @@ elif [ "$MACHINE" = "weaver" ]; then
   IBM_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
   CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,ibm/xl/16.1.1,gcc/7.2.0"
   CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,ibm/xl/16.1.1,gcc/7.4.0"
+  # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 loaded by default
+  CUDA11_MODULE_LIST="cmake/3.21.2,<COMPILER_NAME>/<COMPILER_VERSION>"
 
   # Don't do pthread with Power
   GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
@@ -465,6 +478,8 @@ elif [ "$MACHINE" = "weaver" ]; then
                "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/10.2.089 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/10.2.2 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/11.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   fi
 
@@ -507,6 +522,8 @@ elif [ "$MACHINE" = "caraway" ]; then
   SKIP_HWLOC=True
 
   BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
+  # Cuda11 usage available on the V100 queue
+  CUDA11_MODULE_LIST="cmake/3.22.2,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/8.2.0"
 
   HIPCLANG_BUILD_LIST="Hip_Serial,Hip_OpenMP"
   HIPCLANG_WARNING_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG"
@@ -514,6 +531,12 @@ elif [ "$MACHINE" = "caraway" ]; then
   # Format: (compiler module-list build-list exe-name warning-flag)
   COMPILERS=("rocm/4.3.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
              "rocm/4.5.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
+             "cuda/11.4 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
   )
 
   if [ -z "$ARCH_FLAG" ]; then
@@ -636,6 +659,7 @@ fi
 export OMP_NUM_THREADS=8
 export OMP_PROC_BIND=spread
 export OMP_PLACES=cores
+export OMP_MAX_ACTIVE_LEVELS=1
 
 declare -i NUM_RESULTS_TO_KEEP=7
 
@@ -869,12 +893,12 @@ single_build_and_test() {
 
     # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions
     echo "  #   Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh
-    echo "        ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples ${KOKKOS_BOUNDS_CHECK} $extra_args" &>> call_generate_makefile.sh
+    echo "        ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --no-examples ${KOKKOS_BOUNDS_CHECK} ${KOKKOS_DEPRECATED_CODE} ${KOKKOS_DEPRECATED_CODE_WARNINGS} $extra_args" &>> call_generate_makefile.sh
 
     # store script command with generic path for faster copy/paste of reproducer into issues
-    echo "        \$KOKKOS_PATH/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh
+    echo "        \$KOKKOS_PATH/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples ${KOKKOS_DEPRECATED_CODE} ${KOKKOS_DEPRECATED_CODE_WARNINGS} $extra_args" &> call_generate_makefile_genericpath.sh
 
-    run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+    run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} ${KOKKOS_BOUNDS_CHECK} --no-examples ${KOKKOS_DEPRECATED_CODE} ${KOKKOS_DEPRECATED_CODE_WARNINGS} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
     local make_par_lvl=12
     if [[ "$MACHINE" = white* ]]; then
       make_par_lvl=48
diff --git a/packages/kokkos/simd/CMakeLists.txt b/packages/kokkos/simd/CMakeLists.txt
new file mode 100644
index 000000000..83557e61e
--- /dev/null
+++ b/packages/kokkos/simd/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+KOKKOS_SUBPACKAGE(Simd)
+
+IF (NOT Kokkos_INSTALL_TESTING)
+  ADD_SUBDIRECTORY(src)
+ENDIF()
+
+KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
+
+KOKKOS_SUBPACKAGE_POSTPROCESS()
diff --git a/packages/kokkos/simd/cmake/Dependencies.cmake b/packages/kokkos/simd/cmake/Dependencies.cmake
new file mode 100644
index 000000000..5e2915736
--- /dev/null
+++ b/packages/kokkos/simd/cmake/Dependencies.cmake
@@ -0,0 +1,5 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_REQUIRED_PACKAGES KokkosCore
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
+  TEST_OPTIONAL_TPLS CUSPARSE
+  )
diff --git a/packages/kokkos/simd/src/CMakeLists.txt b/packages/kokkos/simd/src/CMakeLists.txt
new file mode 100644
index 000000000..8779112bc
--- /dev/null
+++ b/packages/kokkos/simd/src/CMakeLists.txt
@@ -0,0 +1,29 @@
+#I have to leave these here for tribits
+KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+#-----------------------------------------------------------------------------
+
+FILE(GLOB SIMD_HEADERS *.hpp)
+FILE(GLOB SIMD_SOURCES *.cpp)
+
+INSTALL (
+  DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
+  DESTINATION ${KOKKOS_HEADER_DIR}
+  FILES_MATCHING PATTERN "*.hpp"
+)
+
+#-----------------------------------------------------------------------------
+
+# We have to pass the sources in here for Tribits
+# These will get ignored for standalone CMake and a true interface library made
+KOKKOS_ADD_LIBRARY(
+  kokkossimd
+  SOURCES ${SIMD_SOURCES}
+  HEADERS ${SIMD_HEADERS}
+)
+KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkossimd
+  ${KOKKOS_TOP_BUILD_DIR}
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}
+)
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD.hpp b/packages/kokkos/simd/src/Kokkos_SIMD.hpp
new file mode 100644
index 000000000..a15a1b8eb
--- /dev/null
+++ b/packages/kokkos/simd/src/Kokkos_SIMD.hpp
@@ -0,0 +1,161 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SIMD_HPP
+#define KOKKOS_SIMD_HPP
+
+#include <Kokkos_SIMD_Common.hpp>
+
+#include <Kokkos_SIMD_Scalar.hpp>
+
+#ifdef KOKKOS_ARCH_AVX512XEON
+#include <Kokkos_SIMD_AVX512.hpp>
+#endif
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace simd_abi {
+
+namespace Impl {
+
+#if defined(KOKKOS_ARCH_AVX512XEON)
+using host_native = avx512_fixed_size<8>;
+#else
+using host_native  = scalar;
+#endif
+
+template <class T>
+struct ForSpace;
+
+#ifdef KOKKOS_ENABLE_SERIAL
+template <>
+struct ForSpace<Kokkos::Serial> {
+  using type = host_native;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct ForSpace<Kokkos::Cuda> {
+  using type = scalar;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+template <>
+struct ForSpace<Kokkos::Threads> {
+  using type = host_native;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_HPX
+template <>
+struct ForSpace<Kokkos::Experimental::HPX> {
+  using type = scalar;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+template <>
+struct ForSpace<Kokkos::OpenMP> {
+  using type = host_native;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+template <>
+struct ForSpace<Kokkos::Experimental::OpenMPTarget> {
+  using type = scalar;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+template <>
+struct ForSpace<Kokkos::Experimental::HIP> {
+  using type = scalar;
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_SYCL
+template <>
+struct ForSpace<Kokkos::Experimental::SYCL> {
+  using type = scalar;
+};
+#endif
+
+}  // namespace Impl
+
+template <class Space>
+using ForSpace = typename Impl::ForSpace<typename Space::execution_space>::type;
+
+template <class T>
+using native = ForSpace<Kokkos::DefaultExecutionSpace>;
+
+}  // namespace simd_abi
+
+template <class T>
+using native_simd = simd<T, simd_abi::native<T>>;
+template <class T>
+using native_simd_mask = simd_mask<T, simd_abi::native<T>>;
+
+namespace Impl {
+
+template <class... Abis>
+class abi_set {};
+
+#ifdef KOKKOS_ARCH_AVX512XEON
+using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx512_fixed_size<8>>;
+#else
+using host_abi_set = abi_set<simd_abi::scalar>;
+#endif
+
+using device_abi_set = abi_set<simd_abi::scalar>;
+
+}  // namespace Impl
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp
new file mode 100644
index 000000000..1df0730ac
--- /dev/null
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp
@@ -0,0 +1,1023 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SIMD_AVX512_HPP
+#define KOKKOS_SIMD_AVX512_HPP
+
+#include <functional>
+#include <type_traits>
+
+#include <Kokkos_SIMD_Common.hpp>
+
+#include <immintrin.h>
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace simd_abi {
+
+template <int N>
+class avx512_fixed_size {};
+
+}  // namespace simd_abi
+
+template <class T>
+class simd_mask<T, simd_abi::avx512_fixed_size<8>> {
+  __mmask8 m_value;
+
+ public:
+  class reference {
+    __mmask8& m_mask;
+    int m_lane;
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __mmask8 bit_mask() const {
+      return __mmask8(std::int16_t(1 << m_lane));
+    }
+
+   public:
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__mmask8& mask_arg,
+                                                    int lane_arg)
+        : m_mask(mask_arg), m_lane(lane_arg) {}
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference
+    operator=(bool value) const {
+      if (value) {
+        m_mask |= bit_mask();
+      } else {
+        m_mask &= ~bit_mask();
+      }
+      return *this;
+    }
+    KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const {
+      return (m_mask & bit_mask()) != 0;
+    }
+  };
+  using value_type                                  = bool;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
+      : m_value(-std::int16_t(value)) {}
+  template <class U>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(
+      simd_mask<U, simd_abi::avx512_fixed_size<8>> const& other)
+      : m_value(static_cast<__mmask8>(other)) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 8;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask(
+      __mmask8 const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __mmask8()
+      const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reference(m_value, int(i));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return static_cast<value_type>(reference(m_value, int(i)));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask
+  operator||(simd_mask const& other) const {
+    return simd_mask(_kor_mask8(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask
+  operator&&(simd_mask const& other) const {
+    return simd_mask(_kand_mask8(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const {
+    static const __mmask8 true_value(static_cast<__mmask8>(simd_mask(true)));
+    return simd_mask(_kxor_mask8(true_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==(
+      simd_mask const& other) const {
+    return m_value == other.m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=(
+      simd_mask const& other) const {
+    return m_value != other.m_value;
+  }
+};
+
+template <>
+class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> {
+  __m256i m_value;
+
+ public:
+  using value_type = std::int32_t;
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  using reference  = value_type&;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 8;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(_mm256_set1_epi32(value_type(value))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      __m256i const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::uint64_t, abi_type> const& other);
+  template <class G,
+            std::enable_if_t<
+                // basically, can you do { value_type r =
+                // gen(std::integral_constant<std::size_t, i>()); }
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
+      : m_value(
+            _mm256_setr_epi32(gen(std::integral_constant<std::size_t, 0>()),
+                              gen(std::integral_constant<std::size_t, 1>()),
+                              gen(std::integral_constant<std::size_t, 2>()),
+                              gen(std::integral_constant<std::size_t, 3>()),
+                              gen(std::integral_constant<std::size_t, 4>()),
+                              gen(std::integral_constant<std::size_t, 5>()),
+                              gen(std::integral_constant<std::size_t, 6>()),
+                              gen(std::integral_constant<std::size_t, 7>()))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reinterpret_cast<value_type*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reinterpret_cast<value_type const*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)),
+                             m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = _mm256_mask_loadu_epi32(
+        _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+      const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<(simd const& other) const {
+    return mask_type(_mm256_cmplt_epi32_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>(simd const& other) const {
+    return mask_type(_mm256_cmplt_epi32_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<=(simd const& other) const {
+    return mask_type(_mm256_cmple_epi32_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>=(simd const& other) const {
+    return mask_type(_mm256_cmple_epi32_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator==(simd const& other) const {
+    return mask_type(_mm256_cmpeq_epi32_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator!=(simd const& other) const {
+    return mask_type(_mm256_cmpneq_epi32_mask(m_value, other.m_value));
+  }
+};
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
+    operator*(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_mullo_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
+    operator+(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& a) {
+  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(0) - a;
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<std::int32_t, simd_abi::avx512_fixed_size<8>> condition(
+    simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& b,
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::int32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c),
+                              static_cast<__m256i>(b)));
+}
+
+template <>
+class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> {
+  __m256i m_value;
+
+ public:
+  using value_type = std::uint32_t;
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  using reference  = value_type&;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 8;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(_mm256_set1_epi32(bit_cast<std::int32_t>(value_type(value)))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      __m256i const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& other)
+      : m_value(static_cast<__m256i>(other)) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reinterpret_cast<value_type*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reinterpret_cast<value_type const*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i()
+      const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<(simd const& other) const {
+    return mask_type(_mm256_cmplt_epu32_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>(simd const& other) const {
+    return mask_type(_mm256_cmplt_epu32_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<=(simd const& other) const {
+    return mask_type(_mm256_cmple_epu32_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>=(simd const& other) const {
+    return mask_type(_mm256_cmple_epu32_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator==(simd const& other) const {
+    return mask_type(_mm256_cmpeq_epu32_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator!=(simd const& other) const {
+    return mask_type(_mm256_cmpneq_epu32_mask(m_value, other.m_value));
+  }
+};
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>
+    operator*(simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_mullo_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>
+    operator+(simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> condition(
+    simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& b,
+    simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>(
+      _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c),
+                              static_cast<__m256i>(b)));
+}
+
+template <>
+class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> {
+  __m512i m_value;
+
+ public:
+  using value_type = std::int64_t;
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  using reference  = value_type&;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 8;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(_mm512_set1_epi64(value_type(value))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& other)
+      : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other);
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reinterpret_cast<value_type*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reinterpret_cast<value_type const*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm512_mask_storeu_epi64(ptr, static_cast<__mmask8>(mask_type(true)),
+                             m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>(int rhs) const {
+    return _mm512_srai_epi64(m_value, rhs);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
+  operator>>(simd<int, simd_abi::avx512_fixed_size<8>> const& rhs) const {
+    return _mm512_srav_epi64(m_value,
+                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<(int rhs) const {
+    return _mm512_slli_epi64(m_value, rhs);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
+  operator<<(simd<int, simd_abi::avx512_fixed_size<8>> const& rhs) const {
+    return _mm512_sllv_epi64(m_value,
+                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i()
+      const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<(simd const& other) const {
+    return mask_type(_mm512_cmplt_epi64_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>(simd const& other) const {
+    return mask_type(_mm512_cmplt_epi64_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<=(simd const& other) const {
+    return mask_type(_mm512_cmple_epi64_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>=(simd const& other) const {
+    return mask_type(_mm512_cmple_epi64_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator==(simd const& other) const {
+    return mask_type(_mm512_cmpeq_epi64_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator!=(simd const& other) const {
+    return mask_type(_mm512_cmpneq_epi64_mask(m_value, other.m_value));
+  }
+};
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
+    operator*(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mullo_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
+    operator+(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& a) {
+  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(0) - a;
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<std::int64_t, simd_abi::avx512_fixed_size<8>> condition(
+    simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& b,
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::int64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c),
+                              static_cast<__m512i>(b)));
+}
+
+template <>
+class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> {
+  __m512i m_value;
+
+ public:
+  using value_type = std::uint64_t;
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  using reference  = value_type&;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 8;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(_mm512_set1_epi64(bit_cast<std::int64_t>(value_type(value)))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr simd(__m512i const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::int32_t, abi_type> const& other)
+      : m_value(_mm512_cvtepi32_epi64(static_cast<__m256i>(other))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd(
+      simd<std::int64_t, abi_type> const& other)
+      : m_value(static_cast<__m512i>(other)) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reinterpret_cast<value_type*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reinterpret_cast<value_type const*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
+  operator>>(unsigned int rhs) const {
+    return _mm512_srli_epi64(m_value, rhs);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>(
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) const {
+    return _mm512_srlv_epi64(m_value,
+                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
+  operator<<(unsigned int rhs) const {
+    return _mm512_slli_epi64(m_value, rhs);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<(
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& rhs) const {
+    return _mm512_sllv_epi64(m_value,
+                             _mm512_cvtepi32_epi64(static_cast<__m256i>(rhs)));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
+  operator&(simd const& other) const {
+    return _mm512_and_epi64(m_value, other.m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd
+  operator|(simd const& other) const {
+    return _mm512_or_epi64(m_value, other.m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i()
+      const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<(simd const& other) const {
+    return mask_type(_mm512_cmplt_epu64_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>(simd const& other) const {
+    return mask_type(_mm512_cmplt_epu64_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<=(simd const& other) const {
+    return mask_type(_mm512_cmple_epu64_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>=(simd const& other) const {
+    return mask_type(_mm512_cmple_epu64_mask(other.m_value, m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator==(simd const& other) const {
+    return mask_type(_mm512_cmpeq_epu64_mask(m_value, other.m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator!=(simd const& other) const {
+    return mask_type(_mm512_cmpneq_epu64_mask(m_value, other.m_value));
+  }
+};
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>
+    operator*(simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mullo_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>
+    operator+(simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_add_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_sub_epi64(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> condition(
+    simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& b,
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<std::uint64_t, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mask_blend_epi64(static_cast<__mmask8>(a), static_cast<__m512i>(c),
+                              static_cast<__m512i>(b)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<std::int32_t, simd_abi::avx512_fixed_size<8>>::simd(
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other)
+    : m_value(_mm512_cvtepi64_epi32(static_cast<__m512i>(other))) {}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<std::int64_t, simd_abi::avx512_fixed_size<8>>::simd(
+    simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> const& other)
+    : m_value(static_cast<__m512i>(other)) {}
+
+template <>
+class simd<double, simd_abi::avx512_fixed_size<8>> {
+  __m512d m_value;
+
+ public:
+  using value_type = double;
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using mask_type  = simd_mask<value_type, abi_type>;
+  using reference  = value_type&;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd()            = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
+    return 8;
+  }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value)
+      : m_value(_mm512_set1_pd(value_type(value))) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(double a, double b, double c,
+                                             double d, double e, double f,
+                                             double g, double h)
+      : m_value(_mm512_setr_pd(a, b, c, d, e, f, g, h)) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd(
+      __m512d const& value_in)
+      : m_value(value_in) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) {
+    return reinterpret_cast<value_type*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type
+  operator[](std::size_t i) const {
+    return reinterpret_cast<value_type const*>(&m_value)[i];
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr,
+                                                       element_aligned_tag) {
+    m_value = _mm512_loadu_pd(ptr);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(
+      value_type* ptr, element_aligned_tag) const {
+    _mm512_storeu_pd(ptr, m_value);
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d()
+      const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<(simd const& other) const {
+    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_LT_OS));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>(simd const& other) const {
+    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_GT_OS));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator<=(simd const& other) const {
+    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_LE_OS));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator>=(simd const& other) const {
+    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_GE_OS));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator==(simd const& other) const {
+    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_EQ_OS));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type
+  operator!=(simd const& other) const {
+    return mask_type(_mm512_cmp_pd_mask(m_value, other.m_value, _CMP_NEQ_OS));
+  }
+};
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<double, simd_abi::avx512_fixed_size<8>>
+    operator*(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mul_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<double, simd_abi::avx512_fixed_size<8>>
+    operator/(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_div_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<double, simd_abi::avx512_fixed_size<8>>
+    operator+(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_add_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<double, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<double, simd_abi::avx512_fixed_size<8>> const& lhs,
+              simd<double, simd_abi::avx512_fixed_size<8>> const& rhs) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_sub_pd(static_cast<__m512d>(lhs), static_cast<__m512d>(rhs)));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+    simd<double, simd_abi::avx512_fixed_size<8>>
+    operator-(simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_sub_pd(_mm512_set1_pd(0.0), static_cast<__m512d>(a)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> copysign(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<double, simd_abi::avx512_fixed_size<8>> const& b) {
+  static const __m512i sign_mask = reinterpret_cast<__m512i>(
+      static_cast<__m512d>(simd<double, simd_abi::avx512_fixed_size<8>>(-0.0)));
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      reinterpret_cast<__m512d>(_mm512_xor_epi64(
+          _mm512_andnot_epi64(
+              sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(a))),
+          _mm512_and_epi64(
+              sign_mask, reinterpret_cast<__m512i>(static_cast<__m512d>(b))))));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> abs(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
+  __m512d const rhs = static_cast<__m512d>(a);
+  return simd<double, simd_abi::avx512_fixed_size<8>>(reinterpret_cast<__m512d>(
+      _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+                       reinterpret_cast<__m512i>(rhs))));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> sqrt(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_sqrt_pd(static_cast<__m512d>(a)));
+}
+
+#ifdef __INTEL_COMPILER
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> cbrt(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_cbrt_pd(static_cast<__m512d>(a)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> exp(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_exp_pd(static_cast<__m512d>(a)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> log(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_log_pd(static_cast<__m512d>(a)));
+}
+
+#endif
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> fma(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<double, simd_abi::avx512_fixed_size<8>> const& b,
+    simd<double, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_fmadd_pd(static_cast<__m512d>(a), static_cast<__m512d>(b),
+                      static_cast<__m512d>(c)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> max(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<double, simd_abi::avx512_fixed_size<8>> const& b) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_max_pd(static_cast<__m512d>(a), static_cast<__m512d>(b)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> min(
+    simd<double, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<double, simd_abi::avx512_fixed_size<8>> const& b) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_min_pd(static_cast<__m512d>(a), static_cast<__m512d>(b)));
+}
+
+KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+simd<double, simd_abi::avx512_fixed_size<8>> condition(
+    simd_mask<double, simd_abi::avx512_fixed_size<8>> const& a,
+    simd<double, simd_abi::avx512_fixed_size<8>> const& b,
+    simd<double, simd_abi::avx512_fixed_size<8>> const& c) {
+  return simd<double, simd_abi::avx512_fixed_size<8>>(
+      _mm512_mask_blend_pd(static_cast<__mmask8>(a), static_cast<__m512d>(c),
+                           static_cast<__m512d>(b)));
+}
+
+template <>
+class const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
+                             simd<double, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using value_type = simd<double, abi_type>;
+  using mask_type  = simd_mask<double, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
+  mask() const {
+    return m_mask;
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
+  value() const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(double* mem, element_aligned_tag) const {
+    _mm512_mask_storeu_pd(mem, static_cast<__mmask8>(m_mask),
+                          static_cast<__m512d>(m_value));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void scatter_to(
+      double* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) const {
+    _mm512_mask_i32scatter_pd(mem, static_cast<__mmask8>(m_mask),
+                              static_cast<__m256i>(index),
+                              static_cast<__m512d>(m_value), 8);
+  }
+};
+
+template <>
+class where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
+                       simd<double, simd_abi::avx512_fixed_size<8>>>
+    : public const_where_expression<
+          simd_mask<double, simd_abi::avx512_fixed_size<8>>,
+          simd<double, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  where_expression(
+      simd_mask<double, simd_abi::avx512_fixed_size<8>> const& mask_arg,
+      simd<double, simd_abi::avx512_fixed_size<8>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(double const* mem, element_aligned_tag) {
+    m_value = value_type(_mm512_mask_loadu_pd(
+        _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem));
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void gather_from(
+      double const* mem,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) {
+    m_value = value_type(_mm512_mask_i32gather_pd(
+        _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask),
+        static_cast<__m256i>(index), mem, 8));
+  }
+  template <class U, std::enable_if_t<
+                         std::is_convertible_v<
+                             U, simd<double, simd_abi::avx512_fixed_size<8>>>,
+                         bool> = false>
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    auto const x_as_value_type =
+        static_cast<simd<double, simd_abi::avx512_fixed_size<8>>>(
+            std::forward<U>(x));
+    m_value = simd<double, simd_abi::avx512_fixed_size<8>>(_mm512_mask_blend_pd(
+        static_cast<__mmask8>(m_mask), static_cast<__m512d>(m_value),
+        static_cast<__m512d>(x_as_value_type)));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>,
+    simd<std::int32_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using value_type = simd<std::int32_t, abi_type>;
+  using mask_type  = simd_mask<std::int32_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
+  mask() const {
+    return m_mask;
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
+  value() const {
+    return m_value;
+  }
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_to(std::int32_t* mem, element_aligned_tag) const {
+    _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask),
+                             static_cast<__m256i>(m_value));
+  }
+};
+
+template <>
+class where_expression<simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>,
+                       simd<std::int32_t, simd_abi::avx512_fixed_size<8>>>
+    : public const_where_expression<
+          simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>,
+          simd<std::int32_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  where_expression(
+      simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>> const& mask_arg,
+      simd<std::int32_t, simd_abi::avx512_fixed_size<8>>& value_arg)
+      : const_where_expression(mask_arg, value_arg) {}
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION
+  void copy_from(std::int32_t const* mem, element_aligned_tag) {
+    m_value = value_type(_mm256_mask_loadu_epi32(
+        _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem));
+  }
+};
+
+template <>
+class const_where_expression<
+    simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>,
+    simd<std::int64_t, simd_abi::avx512_fixed_size<8>>> {
+ public:
+  using abi_type   = simd_abi::avx512_fixed_size<8>;
+  using value_type = simd<std::int64_t, abi_type>;
+  using mask_type  = simd_mask<std::int64_t, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr mask_type const&
+  mask() const {
+    return m_mask;
+  }
+  [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr value_type const&
+  value() const {
+    return m_value;
+  }
+};
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t hmax(
+    const_where_expression<
+        simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>,
+        simd<std::int32_t, simd_abi::avx512_fixed_size<8>>> const& x) {
+  return _mm512_mask_reduce_max_epi32(
+      static_cast<__mmask8>(x.mask()),
+      _mm512_castsi256_si512(static_cast<__m256i>(x.value())));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double hmin(
+    const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
+                           simd<double, simd_abi::avx512_fixed_size<8>>> const&
+        x) {
+  return _mm512_mask_reduce_min_pd(static_cast<__mmask8>(x.mask()),
+                                   static_cast<__m512d>(x.value()));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t reduce(
+    const_where_expression<
+        simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>,
+        simd<std::int64_t, simd_abi::avx512_fixed_size<8>>> const& x,
+    std::int64_t, std::plus<>) {
+  return _mm512_mask_reduce_add_epi64(static_cast<__mmask8>(x.mask()),
+                                      static_cast<__m512i>(x.value()));
+}
+
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double reduce(
+    const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>,
+                           simd<double, simd_abi::avx512_fixed_size<8>>> const&
+        x,
+    double, std::plus<>) {
+  return _mm512_mask_reduce_add_pd(static_cast<__mmask8>(x.mask()),
+                                   static_cast<__m512d>(x.value()));
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp
new file mode 100644
index 000000000..ae2843b30
--- /dev/null
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp
@@ -0,0 +1,428 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SIMD_COMMON_HPP
+#define KOKKOS_SIMD_COMMON_HPP
+
+#include <cmath>
+#include <cstring>
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+
+namespace Experimental {
+
+template <class To, class From>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr To bit_cast(
+    From const& src) {
+  To dst;
+  std::memcpy(&dst, &src, sizeof(To));
+  return dst;
+}
+
+template <class T, class Abi>
+class simd;
+
+template <class T, class Abi>
+class simd_mask;
+
+struct element_aligned_tag {};
+
+// class template declarations for const_where_expression and where_expression
+
+template <class M, class T>
+class const_where_expression {
+ protected:
+  T& m_value;
+  M const& m_mask;
+
+ public:
+  const_where_expression(M const& mask_arg, T const& value_arg)
+      : m_value(const_cast<T&>(value_arg)), m_mask(mask_arg) {}
+  KOKKOS_FORCEINLINE_FUNCTION T const& value() const { return this->m_value; }
+};
+
+template <class M, class T>
+class where_expression : public const_where_expression<M, T> {
+  using base_type = const_where_expression<M, T>;
+
+ public:
+  where_expression(M const& mask_arg, T& value_arg)
+      : base_type(mask_arg, value_arg) {}
+  KOKKOS_FORCEINLINE_FUNCTION T& value() { return this->m_value; }
+};
+
+// specializations of where expression templates for the case when the
+// mask type is bool, to allow generic code to use where() on both
+// SIMD types and non-SIMD builtin arithmetic types
+
+template <class T>
+class const_where_expression<bool, T> {
+ protected:
+  T& m_value;
+  bool m_mask;
+
+ public:
+  KOKKOS_FORCEINLINE_FUNCTION
+  const_where_expression(bool mask_arg, T const& value_arg)
+      : m_value(const_cast<T&>(value_arg)), m_mask(mask_arg) {}
+  KOKKOS_FORCEINLINE_FUNCTION T const& value() const { return this->m_value; }
+};
+
+template <class T>
+class where_expression<bool, T> : public const_where_expression<bool, T> {
+  using base_type = const_where_expression<bool, T>;
+
+ public:
+  KOKKOS_FORCEINLINE_FUNCTION
+  where_expression(bool mask_arg, T& value_arg)
+      : base_type(mask_arg, value_arg) {}
+  KOKKOS_FORCEINLINE_FUNCTION T& value() { return this->m_value; }
+  template <class U,
+            std::enable_if_t<std::is_convertible_v<U, T>, bool> = false>
+  KOKKOS_FORCEINLINE_FUNCTION void operator=(U const& x) {
+    if (this->m_mask) this->m_value = x;
+  }
+};
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+    where_expression<simd_mask<T, Abi>, simd<T, Abi>>
+    where(typename simd<T, Abi>::mask_type const& mask, simd<T, Abi>& value) {
+  return where_expression(mask, value);
+}
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
+    const_where_expression<simd_mask<T, Abi>, simd<T, Abi>>
+    where(typename simd<T, Abi>::mask_type const& mask,
+          simd<T, Abi> const& value) {
+  return const_where_expression(mask, value);
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION where_expression<bool, T> where(
+    bool mask, T& value) {
+  return where_expression(mask, value);
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION const_where_expression<bool, T> where(
+    bool mask, T const& value) {
+  return const_where_expression(mask, value);
+}
+
+// The code below provides:
+// operator@(simd<T, Abi>, Arithmetic)
+// operator@(Arithmetic, simd<T, Abi>)
+// operator@=(simd<T, Abi>&, U&&)
+// operator@=(where_expression<M, T>&, U&&)
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator+(
+    Experimental::simd<T, Abi> const& lhs, U rhs) {
+  using result_member = decltype(lhs[0] + rhs);
+  return Experimental::simd<result_member, Abi>(lhs) +
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator+(
+    U lhs, Experimental::simd<T, Abi> const& rhs) {
+  using result_member = decltype(lhs + rhs[0]);
+  return Experimental::simd<result_member, Abi>(lhs) +
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi>
+KOKKOS_FORCEINLINE_FUNCTION simd<T, Abi>& operator+=(simd<T, Abi>& lhs,
+                                                     U&& rhs) {
+  lhs = lhs + std::forward<U>(rhs);
+  return lhs;
+}
+
+template <class M, class T, class U>
+KOKKOS_FORCEINLINE_FUNCTION where_expression<M, T>& operator+=(
+    where_expression<M, T>& lhs, U&& rhs) {
+  lhs = lhs.value() + std::forward<U>(rhs);
+  return lhs;
+}
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator-(
+    Experimental::simd<T, Abi> const& lhs, U rhs) {
+  using result_member = decltype(lhs[0] - rhs);
+  return Experimental::simd<result_member, Abi>(lhs) -
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator-(
+    U lhs, Experimental::simd<T, Abi> const& rhs) {
+  using result_member = decltype(lhs - rhs[0]);
+  return Experimental::simd<result_member, Abi>(lhs) -
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi>
+KOKKOS_FORCEINLINE_FUNCTION simd<T, Abi>& operator-=(simd<T, Abi>& lhs,
+                                                     U&& rhs) {
+  lhs = lhs - std::forward<U>(rhs);
+  return lhs;
+}
+
+template <class M, class T, class U>
+KOKKOS_FORCEINLINE_FUNCTION where_expression<M, T>& operator-=(
+    where_expression<M, T>& lhs, U&& rhs) {
+  lhs = lhs.value() - std::forward<U>(rhs);
+  return lhs;
+}
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator*(
+    Experimental::simd<T, Abi> const& lhs, U rhs) {
+  using result_member = decltype(lhs[0] * rhs);
+  return Experimental::simd<result_member, Abi>(lhs) *
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator*(
+    U lhs, Experimental::simd<T, Abi> const& rhs) {
+  using result_member = decltype(lhs * rhs[0]);
+  return Experimental::simd<result_member, Abi>(lhs) *
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi>
+KOKKOS_FORCEINLINE_FUNCTION simd<T, Abi>& operator*=(simd<T, Abi>& lhs,
+                                                     U&& rhs) {
+  lhs = lhs * std::forward<U>(rhs);
+  return lhs;
+}
+
+template <class M, class T, class U>
+KOKKOS_FORCEINLINE_FUNCTION where_expression<M, T>& operator*=(
+    where_expression<M, T>& lhs, U&& rhs) {
+  lhs = lhs.value() * std::forward<U>(rhs);
+  return lhs;
+}
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator/(
+    Experimental::simd<T, Abi> const& lhs, U rhs) {
+  using result_member = decltype(lhs[0] / rhs);
+  return Experimental::simd<result_member, Abi>(lhs) /
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi,
+          std::enable_if_t<std::is_arithmetic_v<U>, bool> = false>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto operator/(
+    U lhs, Experimental::simd<T, Abi> const& rhs) {
+  using result_member = decltype(lhs / rhs[0]);
+  return Experimental::simd<result_member, Abi>(lhs) /
+         Experimental::simd<result_member, Abi>(rhs);
+}
+
+template <class T, class U, class Abi>
+KOKKOS_FORCEINLINE_FUNCTION simd<T, Abi>& operator/=(simd<T, Abi>& lhs,
+                                                     U&& rhs) {
+  lhs = lhs / std::forward<U>(rhs);
+  return lhs;
+}
+
+template <class M, class T, class U>
+KOKKOS_FORCEINLINE_FUNCTION where_expression<M, T>& operator/=(
+    where_expression<M, T>& lhs, U&& rhs) {
+  lhs = lhs.value() / std::forward<U>(rhs);
+  return lhs;
+}
+
+// implement mask reductions for type bool to allow generic code to accept
+// both simd<double, Abi> and just double
+
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr bool all_of(bool a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr bool any_of(bool a) {
+  return a;
+}
+
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr bool none_of(bool a) {
+  return !a;
+}
+
+// fallback implementations of reductions across simd_mask:
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool all_of(
+    simd_mask<T, Abi> const& a) {
+  return a == simd_mask<T, Abi>(true);
+}
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool any_of(
+    simd_mask<T, Abi> const& a) {
+  return a != simd_mask<T, Abi>(false);
+}
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION bool none_of(
+    simd_mask<T, Abi> const& a) {
+  return a == simd_mask<T, Abi>(false);
+}
+
+}  // namespace Experimental
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> min(
+    Experimental::simd<T, Abi> const& a, Experimental::simd<T, Abi> const& b) {
+  Experimental::simd<T, Abi> result;
+  for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {
+    result[i] = Kokkos::min(a[i], b[i]);
+  }
+  return result;
+}
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<T, Abi> max(
+    Experimental::simd<T, Abi> const& a, Experimental::simd<T, Abi> const& b) {
+  Experimental::simd<T, Abi> result;
+  for (std::size_t i = 0; i < Experimental::simd<T, Abi>::size(); ++i) {
+    result[i] = Kokkos::max(a[i], b[i]);
+  }
+  return result;
+}
+
+// fallback implementations of <cmath> functions.
+// individual Abi types may provide overloads with more efficient
+// implementations.
+// These are not in the Experimental namespace because their double
+// overloads are not either
+
+#define KOKKOS_IMPL_SIMD_UNARY_FUNCTION(FUNC)                               \
+  template <class Abi>                                                      \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<double, Abi> \
+  FUNC(Experimental::simd<double, Abi> const& a) {                          \
+    Experimental::simd<double, Abi> result;                                 \
+    for (std::size_t i = 0; i < Experimental::simd<double, Abi>::size();    \
+         ++i) {                                                             \
+      result[i] = Kokkos::FUNC(a[i]);                                       \
+    }                                                                       \
+    return result;                                                          \
+  }
+
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(abs)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(exp2)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log10)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(log2)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sqrt)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cbrt)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sin)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cos)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tan)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asin)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acos)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atan)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(sinh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(cosh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tanh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(asinh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(acosh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(atanh)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erf)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(erfc)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(tgamma)
+KOKKOS_IMPL_SIMD_UNARY_FUNCTION(lgamma)
+
+#define KOKKOS_IMPL_SIMD_BINARY_FUNCTION(FUNC)                              \
+  template <class Abi>                                                      \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<double, Abi> \
+  FUNC(Experimental::simd<double, Abi> const& a,                            \
+       Experimental::simd<double, Abi> const& b) {                          \
+    Experimental::simd<double, Abi> result;                                 \
+    for (std::size_t i = 0; i < Experimental::simd<double, Abi>::size();    \
+         ++i) {                                                             \
+      result[i] = Kokkos::FUNC(a[i], b[i]);                                 \
+    }                                                                       \
+    return result;                                                          \
+  }
+
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(pow)
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(hypot)
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(atan2)
+KOKKOS_IMPL_SIMD_BINARY_FUNCTION(copysign)
+
+#define KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(FUNC)                             \
+  template <class Abi>                                                      \
+  [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION Experimental::simd<double, Abi> \
+  FUNC(Experimental::simd<double, Abi> const& a,                            \
+       Experimental::simd<double, Abi> const& b,                            \
+       Experimental::simd<double, Abi> const& c) {                          \
+    Experimental::simd<double, Abi> result;                                 \
+    for (std::size_t i = 0; i < Experimental::simd<double, Abi>::size();    \
+         ++i) {                                                             \
+      result[i] = Kokkos::FUNC(a[i], b[i], c[i]);                           \
+    }                                                                       \
+    return result;                                                          \
+  }
+
+KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(fma)
+KOKKOS_IMPL_SIMD_TERNARY_FUNCTION(hypot)
+
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp
new file mode 100644
index 000000000..f0d06695e
--- /dev/null
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp
@@ -0,0 +1,353 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SIMD_SCALAR_HPP
+#define KOKKOS_SIMD_SCALAR_HPP
+
+#include <type_traits>
+#include <climits>
+#include <cfloat>
+
+#include <Kokkos_SIMD_Common.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace simd_abi {
+
+class scalar {};
+
+}  // namespace simd_abi
+
+template <class T>
+class simd_mask<T, simd_abi::scalar> {
+  bool m_value;
+
+ public:
+  using value_type                      = bool;
+  using simd_type                       = simd<T, simd_abi::scalar>;
+  using abi_type                        = simd_abi::scalar;
+  using reference                       = value_type&;
+  KOKKOS_DEFAULTED_FUNCTION simd_mask() = default;
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 1; }
+  KOKKOS_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
+      : m_value(value) {}
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION simd_mask(
+      simd_mask<U, simd_abi::scalar> const& other)
+      : m_value(static_cast<bool>(other)) {}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr explicit operator bool() const {
+    return m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) {
+    return m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION value_type operator[](std::size_t) const {
+    return m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd_mask
+  operator||(simd_mask const& other) const {
+    return simd_mask(m_value || other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd_mask
+  operator&&(simd_mask const& other) const {
+    return simd_mask(m_value && other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd_mask operator!() const {
+    return simd_mask(!m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION bool operator==(simd_mask const& other) const {
+    return m_value == other.m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION bool operator!=(simd_mask const& other) const {
+    return m_value != other.m_value;
+  }
+};
+
+template <class T>
+class simd<T, simd_abi::scalar> {
+  T m_value;
+
+ public:
+  using value_type                            = T;
+  using abi_type                              = simd_abi::scalar;
+  using mask_type                             = simd_mask<T, abi_type>;
+  using reference                             = value_type&;
+  KOKKOS_DEFAULTED_FUNCTION simd()            = default;
+  KOKKOS_DEFAULTED_FUNCTION simd(simd const&) = default;
+  KOKKOS_DEFAULTED_FUNCTION simd(simd&&)      = default;
+  KOKKOS_DEFAULTED_FUNCTION simd& operator=(simd const&) = default;
+  KOKKOS_DEFAULTED_FUNCTION simd& operator=(simd&&) = default;
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 1; }
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_FORCEINLINE_FUNCTION simd(U&& value) : m_value(value) {}
+  template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>,
+                                      bool> = false>
+  KOKKOS_FORCEINLINE_FUNCTION explicit simd(simd<U, abi_type> const& other)
+      : m_value(static_cast<U>(other)) {}
+  template <class G,
+            std::enable_if_t<
+                // basically, can you do { value_type r =
+                // gen(std::integral_constant<std::size_t, i>()); }
+                std::is_invocable_r_v<value_type, G,
+                                      std::integral_constant<std::size_t, 0>>,
+                bool> = false>
+  KOKKOS_FORCEINLINE_FUNCTION simd(G&& gen)
+      : m_value(gen(std::integral_constant<std::size_t, 0>())) {}
+  KOKKOS_FORCEINLINE_FUNCTION simd operator-() const { return simd(-m_value); }
+  KOKKOS_FORCEINLINE_FUNCTION simd operator>>(int rhs) const {
+    return simd(m_value >> rhs);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd
+  operator>>(simd<int, abi_type> const& rhs) const {
+    return simd(m_value >> static_cast<int>(rhs));
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd operator<<(int rhs) const {
+    return simd(m_value << rhs);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd
+  operator<<(simd<int, abi_type> const& rhs) const {
+    return simd(m_value << static_cast<int>(rhs));
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd operator&(simd const& other) const {
+    return m_value & other.m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION simd operator|(simd const& other) const {
+    return m_value | other.m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr explicit operator T() const {
+    return m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION mask_type operator<(simd const& other) const {
+    return mask_type(m_value < other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION mask_type operator>(simd const& other) const {
+    return mask_type(m_value > other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION mask_type operator<=(simd const& other) const {
+    return mask_type(m_value <= other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION mask_type operator>=(simd const& other) const {
+    return mask_type(m_value >= other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION mask_type operator==(simd const& other) const {
+    return mask_type(m_value == other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION mask_type operator!=(simd const& other) const {
+    return mask_type(m_value != other.m_value);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr,
+                                             element_aligned_tag) {
+    m_value = *ptr;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const {
+    *ptr = m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) {
+    return m_value;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION value_type operator[](std::size_t) const {
+    return m_value;
+  }
+};
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator*(
+    simd<T, simd_abi::scalar> const& lhs,
+    simd<T, simd_abi::scalar> const& rhs) {
+  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) * static_cast<T>(rhs));
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator/(
+    simd<T, simd_abi::scalar> const& lhs,
+    simd<T, simd_abi::scalar> const& rhs) {
+  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) / static_cast<T>(rhs));
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator+(
+    simd<T, simd_abi::scalar> const& lhs,
+    simd<T, simd_abi::scalar> const& rhs) {
+  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) + static_cast<T>(rhs));
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> operator-(
+    simd<T, simd_abi::scalar> const& lhs,
+    simd<T, simd_abi::scalar> const& rhs) {
+  return simd<T, simd_abi::scalar>(static_cast<T>(lhs) - static_cast<T>(rhs));
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> abs(
+    simd<T, simd_abi::scalar> const& a) {
+  return simd<T, simd_abi::scalar>(std::abs(static_cast<T>(a)));
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> sqrt(
+    simd<T, simd_abi::scalar> const& a) {
+  return simd<T, simd_abi::scalar>(std::sqrt(static_cast<T>(a)));
+}
+
+template <class T>
+KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> fma(
+    simd<T, simd_abi::scalar> const& x, simd<T, simd_abi::scalar> const& y,
+    simd<T, simd_abi::scalar> const& z) {
+  return simd<T, simd_abi::scalar>((static_cast<T>(x) * static_cast<T>(y)) +
+                                   static_cast<T>(z));
+}
+
+template <class T>
+KOKKOS_FORCEINLINE_FUNCTION simd<T, simd_abi::scalar> condition(
+    desul::Impl::dont_deduce_this_parameter_t<
+        simd_mask<T, simd_abi::scalar>> const& a,
+    simd<T, simd_abi::scalar> const& b, simd<T, simd_abi::scalar> const& c) {
+  return simd<T, simd_abi::scalar>(static_cast<bool>(a) ? static_cast<T>(b)
+                                                        : static_cast<T>(c));
+}
+
+template <class T, class Abi>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION simd<T, Abi> copysign(
+    simd<T, Abi> const& a, simd<T, Abi> const& b) {
+  return std::copysign(static_cast<T>(a), static_cast<T>(b));
+}
+
+template <class T>
+class const_where_expression<simd_mask<T, simd_abi::scalar>,
+                             simd<T, simd_abi::scalar>> {
+ public:
+  using abi_type   = simd_abi::scalar;
+  using value_type = simd<T, abi_type>;
+  using mask_type  = simd_mask<T, abi_type>;
+
+ protected:
+  value_type& m_value;
+  mask_type const& m_mask;
+
+ public:
+  KOKKOS_FORCEINLINE_FUNCTION
+  const_where_expression(mask_type const& mask_arg, value_type const& value_arg)
+      : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {}
+  KOKKOS_FORCEINLINE_FUNCTION
+  mask_type const& mask() const { return m_mask; }
+  KOKKOS_FORCEINLINE_FUNCTION
+  value_type const& value() const { return m_value; }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void copy_to(T* mem, element_aligned_tag) const {
+    if (static_cast<bool>(m_mask)) *mem = static_cast<T>(m_value);
+  }
+  template <class Integral>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<std::is_integral_v<Integral>>
+  scatter_to(T* mem, simd<Integral, simd_abi::scalar> const& index) const {
+    if (static_cast<bool>(m_mask))
+      mem[static_cast<Integral>(index)] = static_cast<T>(m_value);
+  }
+};
+
+template <class T>
+class where_expression<simd_mask<T, simd_abi::scalar>,
+                       simd<T, simd_abi::scalar>>
+    : public const_where_expression<simd_mask<T, simd_abi::scalar>,
+                                    simd<T, simd_abi::scalar>> {
+  using base_type = const_where_expression<simd_mask<T, simd_abi::scalar>,
+                                           simd<T, simd_abi::scalar>>;
+
+ public:
+  using typename base_type::value_type;
+  KOKKOS_FORCEINLINE_FUNCTION
+  where_expression(simd_mask<T, simd_abi::scalar> const& mask_arg,
+                   simd<T, simd_abi::scalar>& value_arg)
+      : base_type(mask_arg, value_arg) {}
+  KOKKOS_FORCEINLINE_FUNCTION
+  void copy_from(T const* mem, element_aligned_tag) {
+    if (static_cast<bool>(this->m_mask)) this->m_value = *mem;
+  }
+  template <class Integral>
+  KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<std::is_integral_v<Integral>>
+  gather_from(T const* mem, simd<Integral, simd_abi::scalar> const& index) {
+    if (static_cast<bool>(this->m_mask))
+      this->m_value = mem[static_cast<Integral>(index)];
+  }
+  template <class U, std::enable_if_t<
+                         std::is_convertible_v<U, simd<T, simd_abi::scalar>>,
+                         bool> = false>
+  KOKKOS_FORCEINLINE_FUNCTION void operator=(U&& x) {
+    if (static_cast<bool>(this->m_mask))
+      this->m_value =
+          static_cast<simd<T, simd_abi::scalar>>(std::forward<U>(x));
+  }
+};
+
+template <class T, class BinaryOp>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T
+reduce(const_where_expression<simd_mask<T, simd_abi::scalar>,
+                              simd<T, simd_abi::scalar>> const& x,
+       T identity_element, BinaryOp) {
+  return static_cast<bool>(x.mask()) ? static_cast<T>(x.value())
+                                     : identity_element;
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T
+hmax(const_where_expression<simd_mask<T, simd_abi::scalar>,
+                            simd<T, simd_abi::scalar>> const& x) {
+  return static_cast<bool>(x.mask()) ? static_cast<T>(x.value())
+                                     : Kokkos::reduction_identity<T>::max();
+}
+
+template <class T>
+[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T
+hmin(const_where_expression<simd_mask<T, simd_abi::scalar>,
+                            simd<T, simd_abi::scalar>> const& x) {
+  return static_cast<bool>(x.mask()) ? static_cast<T>(x.value())
+                                     : Kokkos::reduction_identity<T>::min();
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_dummy.cpp b/packages/kokkos/simd/src/Kokkos_SIMD_dummy.cpp
new file mode 100644
index 000000000..d273cb6d5
--- /dev/null
+++ b/packages/kokkos/simd/src/Kokkos_SIMD_dummy.cpp
@@ -0,0 +1,7 @@
+// This file is needed in order to get the linker language
+// for the header only submodule.
+// While we set the language properties in our normal cmake
+// path it does not get set in the Trilinos environment.
+// Furthermore, setting LINKER_LANGUAGE is only supported
+// in CMAKE 3.19 and up.
+void KOKKOS_SIMD_SRC_DUMMY_PREVENT_LINK_ERROR() {}
diff --git a/packages/kokkos/simd/unit_tests/CMakeLists.txt b/packages/kokkos/simd/unit_tests/CMakeLists.txt
new file mode 100644
index 000000000..f4de86241
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/CMakeLists.txt
@@ -0,0 +1,5 @@
+KOKKOS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_SIMD
+  SOURCES
+  UnitTestMain.cpp
+  TestSIMD.cpp)
diff --git a/packages/kokkos/simd/unit_tests/TestSIMD.cpp b/packages/kokkos/simd/unit_tests/TestSIMD.cpp
new file mode 100644
index 000000000..fdf72e91c
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/TestSIMD.cpp
@@ -0,0 +1,376 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_SIMD.hpp>
+
+class gtest_checker {
+ public:
+  void truth(bool x) const { EXPECT_TRUE(x); }
+  template <class T>
+  void equality(T const& a, T const& b) const {
+    EXPECT_EQ(a, b);
+  }
+};
+
+class kokkos_checker {
+ public:
+  KOKKOS_INLINE_FUNCTION void truth(bool x) const {
+    if (!x) Kokkos::abort("SIMD unit test truth condition failed on device");
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION void equality(T const& a, T const& b) const {
+    if (a != b)
+      Kokkos::abort("SIMD unit test equality condition failed on device");
+  }
+};
+
+template <class T, class Abi>
+inline void host_check_equality(
+    Kokkos::Experimental::simd<T, Abi> const& expected_result,
+    Kokkos::Experimental::simd<T, Abi> const& computed_result,
+    std::size_t nlanes) {
+  gtest_checker checker;
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    checker.equality(expected_result[i], computed_result[i]);
+  }
+  using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+  mask_type mask(false);
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    mask[i] = true;
+  }
+  checker.equality((expected_result == computed_result) && mask, mask);
+}
+
+template <class T, class Abi>
+KOKKOS_INLINE_FUNCTION void device_check_equality(
+    Kokkos::Experimental::simd<T, Abi> const& expected_result,
+    Kokkos::Experimental::simd<T, Abi> const& computed_result,
+    std::size_t nlanes) {
+  kokkos_checker checker;
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    checker.equality(expected_result[i], computed_result[i]);
+  }
+  using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+  mask_type mask(false);
+  for (std::size_t i = 0; i < nlanes; ++i) {
+    mask[i] = true;
+  }
+  checker.equality((expected_result == computed_result) && mask, mask);
+}
+
+class load_element_aligned {
+ public:
+  template <class T, class Abi>
+  bool host_load(T const* mem, std::size_t n,
+                 Kokkos::Experimental::simd<T, Abi>& result) const {
+    if (n < result.size()) return false;
+    result.copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    return true;
+  }
+  template <class T, class Abi>
+  KOKKOS_INLINE_FUNCTION bool device_load(
+      T const* mem, std::size_t n,
+      Kokkos::Experimental::simd<T, Abi>& result) const {
+    if (n < result.size()) return false;
+    result.copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    return true;
+  }
+};
+
+class load_masked {
+ public:
+  template <class T, class Abi>
+  bool host_load(T const* mem, std::size_t n,
+                 Kokkos::Experimental::simd<T, Abi>& result) const {
+    using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+    mask_type mask(false);
+    for (std::size_t i = 0; i < n; ++i) {
+      mask[i] = true;
+    }
+    where(mask, result)
+        .copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    where(!mask, result) = 0;
+    return true;
+  }
+  template <class T, class Abi>
+  KOKKOS_INLINE_FUNCTION bool device_load(
+      T const* mem, std::size_t n,
+      Kokkos::Experimental::simd<T, Abi>& result) const {
+    using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type;
+    mask_type mask(false);
+    for (std::size_t i = 0; i < n; ++i) {
+      mask[i] = true;
+    }
+    where(mask, result)
+        .copy_from(mem, Kokkos::Experimental::element_aligned_tag());
+    where(!mask, result) = T(0);
+    return true;
+  }
+};
+
+class load_as_scalars {
+ public:
+  template <class T, class Abi>
+  bool host_load(T const* mem, std::size_t n,
+                 Kokkos::Experimental::simd<T, Abi>& result) const {
+    for (std::size_t i = 0; i < n; ++i) {
+      result[i] = mem[i];
+    }
+    for (std::size_t i = n; i < result.size(); ++i) {
+      result[i] = T(0);
+    }
+    return true;
+  }
+  template <class T, class Abi>
+  KOKKOS_INLINE_FUNCTION bool device_load(
+      T const* mem, std::size_t n,
+      Kokkos::Experimental::simd<T, Abi>& result) const {
+    for (std::size_t i = 0; i < n; ++i) {
+      result[i] = mem[i];
+    }
+    for (std::size_t i = n; i < result.size(); ++i) {
+      result[i] = T(0);
+    }
+    return true;
+  }
+};
+
+template <class Abi, class Loader, class BinaryOp, class T>
+void host_check_binary_op_one_loader(BinaryOp binary_op, std::size_t n,
+                                     T const* first_args,
+                                     T const* second_args) {
+  Loader loader;
+  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
+  std::size_t constexpr width = simd_type::size();
+  for (std::size_t i = 0; i < n; i += width) {
+    std::size_t const nremaining = n - i;
+    std::size_t const nlanes     = Kokkos::min(nremaining, width);
+    simd_type first_arg;
+    bool const loaded_first_arg =
+        loader.host_load(first_args + i, nlanes, first_arg);
+    simd_type second_arg;
+    bool const loaded_second_arg =
+        loader.host_load(second_args + i, nlanes, second_arg);
+    if (!(loaded_first_arg && loaded_second_arg)) continue;
+    simd_type expected_result;
+    for (std::size_t lane = 0; lane < nlanes; ++lane) {
+      expected_result[lane] =
+          binary_op.on_host(first_arg[lane], second_arg[lane]);
+    }
+    simd_type const computed_result = binary_op.on_host(first_arg, second_arg);
+    host_check_equality(expected_result, computed_result, nlanes);
+  }
+}
+
+template <class Abi, class Loader, class BinaryOp, class T>
+KOKKOS_INLINE_FUNCTION void device_check_binary_op_one_loader(
+    BinaryOp binary_op, std::size_t n, T const* first_args,
+    T const* second_args) {
+  Loader loader;
+  using simd_type             = Kokkos::Experimental::simd<T, Abi>;
+  std::size_t constexpr width = simd_type::size();
+  for (std::size_t i = 0; i < n; i += width) {
+    std::size_t const nremaining = n - i;
+    std::size_t const nlanes     = Kokkos::min(nremaining, width);
+    simd_type first_arg;
+    bool const loaded_first_arg =
+        loader.device_load(first_args + i, nlanes, first_arg);
+    simd_type second_arg;
+    bool const loaded_second_arg =
+        loader.device_load(second_args + i, nlanes, second_arg);
+    if (!(loaded_first_arg && loaded_second_arg)) continue;
+    simd_type expected_result;
+    for (std::size_t lane = 0; lane < nlanes; ++lane) {
+      expected_result[lane] =
+          binary_op.on_device(first_arg[lane], second_arg[lane]);
+    }
+    simd_type const computed_result =
+        binary_op.on_device(first_arg, second_arg);
+    device_check_equality(expected_result, computed_result, nlanes);
+  }
+}
+
+template <class Abi, class BinaryOp, class T>
+inline void host_check_binary_op_all_loaders(BinaryOp binary_op, std::size_t n,
+                                             T const* first_args,
+                                             T const* second_args) {
+  host_check_binary_op_one_loader<Abi, load_element_aligned>(
+      binary_op, n, first_args, second_args);
+  host_check_binary_op_one_loader<Abi, load_masked>(binary_op, n, first_args,
+                                                    second_args);
+  host_check_binary_op_one_loader<Abi, load_as_scalars>(
+      binary_op, n, first_args, second_args);
+}
+
+template <class Abi, class BinaryOp, class T>
+KOKKOS_INLINE_FUNCTION void device_check_binary_op_all_loaders(
+    BinaryOp binary_op, std::size_t n, T const* first_args,
+    T const* second_args) {
+  device_check_binary_op_one_loader<Abi, load_element_aligned>(
+      binary_op, n, first_args, second_args);
+  device_check_binary_op_one_loader<Abi, load_masked>(binary_op, n, first_args,
+                                                      second_args);
+  device_check_binary_op_one_loader<Abi, load_as_scalars>(
+      binary_op, n, first_args, second_args);
+}
+
+class plus {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a + b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a + b;
+  }
+};
+
+class minus {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a - b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a - b;
+  }
+};
+
+class multiplies {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a * b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a * b;
+  }
+};
+
+class divides {
+ public:
+  template <class T>
+  auto on_host(T const& a, T const& b) const {
+    return a / b;
+  }
+  template <class T>
+  KOKKOS_INLINE_FUNCTION auto on_device(T const& a, T const& b) const {
+    return a / b;
+  }
+};
+
+template <class Abi>
+inline void host_check_math_ops() {
+  std::size_t constexpr n     = 11;
+  double const first_args[n]  = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2};
+  double const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2};
+  host_check_binary_op_all_loaders<Abi>(plus(), n, first_args, second_args);
+  host_check_binary_op_all_loaders<Abi>(minus(), n, first_args, second_args);
+  host_check_binary_op_all_loaders<Abi>(multiplies(), n, first_args,
+                                        second_args);
+  host_check_binary_op_all_loaders<Abi>(divides(), n, first_args, second_args);
+}
+
+template <class Abi>
+KOKKOS_INLINE_FUNCTION void device_check_math_ops() {
+  std::size_t constexpr n     = 11;
+  double const first_args[n]  = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2};
+  double const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2};
+  device_check_binary_op_all_loaders<Abi>(plus(), n, first_args, second_args);
+  device_check_binary_op_all_loaders<Abi>(minus(), n, first_args, second_args);
+  device_check_binary_op_all_loaders<Abi>(multiplies(), n, first_args,
+                                          second_args);
+  device_check_binary_op_all_loaders<Abi>(divides(), n, first_args,
+                                          second_args);
+}
+
+template <class Abi>
+inline void host_check_abi() {
+  host_check_math_ops<Abi>();
+}
+
+template <class Abi>
+KOKKOS_INLINE_FUNCTION void device_check_abi() {
+  device_check_math_ops<Abi>();
+}
+
+inline void host_check_abis(Kokkos::Experimental::Impl::abi_set<>) {}
+
+KOKKOS_INLINE_FUNCTION void device_check_abis(
+    Kokkos::Experimental::Impl::abi_set<>) {}
+
+template <class FirstAbi, class... RestAbis>
+inline void host_check_abis(
+    Kokkos::Experimental::Impl::abi_set<FirstAbi, RestAbis...>) {
+  host_check_abi<FirstAbi>();
+  host_check_abis(Kokkos::Experimental::Impl::abi_set<RestAbis...>());
+}
+
+template <class FirstAbi, class... RestAbis>
+KOKKOS_INLINE_FUNCTION void device_check_abis(
+    Kokkos::Experimental::Impl::abi_set<FirstAbi, RestAbis...>) {
+  device_check_abi<FirstAbi>();
+  device_check_abis(Kokkos::Experimental::Impl::abi_set<RestAbis...>());
+}
+
+TEST(simd, host) {
+  host_check_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+class simd_device_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_abis(Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, device) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
+                       simd_device_functor());
+}
diff --git a/packages/kokkos/simd/unit_tests/UnitTestMain.cpp b/packages/kokkos/simd/unit_tests/UnitTestMain.cpp
new file mode 100644
index 000000000..e245aad35
--- /dev/null
+++ b/packages/kokkos/simd/unit_tests/UnitTestMain.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+int main(int argc, char *argv[]) {
+  Kokkos::initialize(argc, argv);
+  ::testing::InitGoogleTest(&argc, argv);
+  int result = RUN_ALL_TESTS();
+  Kokkos::finalize();
+  return result;
+}
diff --git a/packages/kokkos/core/src/desul/.clang-format b/packages/kokkos/tpls/desul/include/desul/.clang-format
similarity index 100%
rename from packages/kokkos/core/src/desul/.clang-format
rename to packages/kokkos/tpls/desul/include/desul/.clang-format
diff --git a/packages/kokkos/core/src/desul/atomics.hpp b/packages/kokkos/tpls/desul/include/desul/atomics.hpp
similarity index 98%
rename from packages/kokkos/core/src/desul/atomics.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics.hpp
index ab3fe2539..8ba5b0f3a 100644
--- a/packages/kokkos/core/src/desul/atomics.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -9,11 +9,10 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #ifndef DESUL_ATOMICS_HPP_
 #define DESUL_ATOMICS_HPP_
 
-#include "desul/atomics/Macros.hpp"
-
 #include "desul/atomics/Atomic_Ref.hpp"
 #include "desul/atomics/Compare_Exchange.hpp"
 #include "desul/atomics/Generic.hpp"
 #include "desul/atomics/Lock_Array.hpp"
+#include "desul/atomics/Macros.hpp"
 
 #endif
diff --git a/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp
similarity index 91%
rename from packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp
index 73cd01a7e..fbf2dcf6b 100644
--- a/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp
@@ -103,10 +103,10 @@ struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, false> {
   DESUL_FUNCTION bool compare_exchange_weak(
       T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_weak(expected,
-                          desired,
-                          order,
-                          cmpexch_failure_memory_order<_MemoryOrder>(),
-                          MemoryScope());
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>(),
+                                 MemoryScope());
   }
 
   template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
@@ -123,10 +123,10 @@ struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, false> {
   DESUL_FUNCTION bool compare_exchange_strong(
       T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_strong(expected,
-                            desired,
-                            order,
-                            cmpexch_failure_memory_order<_MemoryOrder>(),
-                            MemoryScope());
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>(),
+                                   MemoryScope());
   }
 };
 
@@ -195,10 +195,10 @@ struct basic_atomic_ref<T, MemoryOrder, MemoryScope, true, false> {
   DESUL_FUNCTION bool compare_exchange_weak(
       T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_weak(expected,
-                          desired,
-                          order,
-                          cmpexch_failure_memory_order<_MemoryOrder>(),
-                          MemoryScope());
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>(),
+                                 MemoryScope());
   }
 
   template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
@@ -215,10 +215,10 @@ struct basic_atomic_ref<T, MemoryOrder, MemoryScope, true, false> {
   DESUL_FUNCTION bool compare_exchange_strong(
       T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_strong(expected,
-                            desired,
-                            order,
-                            cmpexch_failure_memory_order<_MemoryOrder>(),
-                            MemoryScope());
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>(),
+                                   MemoryScope());
   }
 
   template <typename _MemoryOrder = MemoryOrder>
@@ -348,10 +348,10 @@ struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, true> {
   DESUL_FUNCTION bool compare_exchange_weak(
       T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_weak(expected,
-                          desired,
-                          order,
-                          cmpexch_failure_memory_order<_MemoryOrder>(),
-                          MemoryScope());
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>(),
+                                 MemoryScope());
   }
 
   template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
@@ -368,10 +368,10 @@ struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, true> {
   DESUL_FUNCTION bool compare_exchange_strong(
       T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_strong(expected,
-                            desired,
-                            order,
-                            cmpexch_failure_memory_order<_MemoryOrder>(),
-                            MemoryScope());
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>(),
+                                   MemoryScope());
   }
 
   template <typename _MemoryOrder = MemoryOrder>
@@ -457,10 +457,10 @@ struct basic_atomic_ref<T*, MemoryOrder, MemoryScope, false, false> {
   DESUL_FUNCTION bool compare_exchange_weak(
       T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_weak(expected,
-                          desired,
-                          order,
-                          cmpexch_failure_memory_order<_MemoryOrder>(),
-                          MemoryScope());
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>(),
+                                 MemoryScope());
   }
 
   template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
@@ -477,10 +477,10 @@ struct basic_atomic_ref<T*, MemoryOrder, MemoryScope, false, false> {
   DESUL_FUNCTION bool compare_exchange_strong(
       T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
     return compare_exchange_strong(expected,
-                            desired,
-                            order,
-                            cmpexch_failure_memory_order<_MemoryOrder>(),
-                            MemoryScope());
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>(),
+                                   MemoryScope());
   }
 
   template <typename _MemoryOrder = MemoryOrder>
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/CUDA.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/CUDA.hpp
new file mode 100644
index 000000000..87c0df4af
--- /dev/null
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/CUDA.hpp
@@ -0,0 +1,664 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_CUDA_HPP_
+#define DESUL_ATOMICS_CUDA_HPP_
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+// When building with Clang we need to include the device functions always since Clang
+// must see a consistent overload set in both device and host compilation, but that
+// means we need to know on the host what to make visible, i.e. we need a host side
+// compile knowledge of architecture.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) || \
+    (!defined(__NVCC__) && !defined(DESUL_CUDA_ARCH_IS_PRE_VOLTA))
+#define DESUL_HAVE_CUDA_ATOMICS_ASM
+#include <desul/atomics/cuda/CUDA_asm.hpp>
+#endif
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)) || \
+    (!defined(__NVCC__) && !defined(DESUL_HAVE_CUDA_ATOMICS_ASM))
+namespace desul {
+namespace Impl {
+template <class T>
+struct is_cuda_atomic_integer_type {
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long int>::value;
+};
+
+template <class T>
+struct is_cuda_atomic_add_type {
+  static constexpr bool value = is_cuda_atomic_integer_type<T>::value ||
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+                                std::is_same<T, double>::value ||
+#endif
+                                std::is_same<T, float>::value;
+};
+
+template <class T>
+struct is_cuda_atomic_sub_type {
+  static constexpr bool value =
+      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value;
+};
+}  // namespace Impl
+
+// Atomic Add
+template <class T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_add_type<T>::value, T>
+    atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest, val);
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_add_type<T>::value, T>
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_add_type<T>::value, T>
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Sub
+template <class T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_sub_type<T>::value, T>
+    atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest, val);
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_sub_type<T>::value, T>
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_sub_type<T>::value, T>
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Wrap around atomic add
+__device__ inline unsigned int atomic_fetch_inc_mod(unsigned int* dest,
+                                                    unsigned int val,
+                                                    MemoryOrderRelaxed,
+                                                    MemoryScopeDevice) {
+  return atomicInc(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc_mod(unsigned int* dest,
+                                                    unsigned int val,
+                                                    MemoryOrder,
+                                                    MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicInc(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc_mod(unsigned int* dest,
+                                                    unsigned int val,
+                                                    MemoryOrder,
+                                                    MemoryScopeCore) {
+  return atomic_fetch_inc_mod(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Wrap around atomic sub
+__device__ inline unsigned int atomic_fetch_dec_mod(unsigned int* dest,
+                                                    unsigned int val,
+                                                    MemoryOrderRelaxed,
+                                                    MemoryScopeDevice) {
+  return atomicDec(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec_mod(unsigned int* dest,
+                                                    unsigned int val,
+                                                    MemoryOrder,
+                                                    MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicDec(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec_mod(unsigned int* dest,
+                                                    unsigned int val,
+                                                    MemoryOrder,
+                                                    MemoryScopeCore) {
+  return atomic_fetch_dec_mod(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Inc
+template <typename T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_add_type<T>::value, T>
+    atomic_fetch_inc(T* dest, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest, T(1));
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_add_type<T>::value, T>
+    atomic_fetch_inc(T* dest, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest, T(1));
+  __threadfence();
+
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_add_type<T>::value, T>
+    atomic_fetch_inc(T* dest, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest, T(1), MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Dec
+template <typename T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_sub_type<T>::value, T>
+    atomic_fetch_dec(T* dest, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest, T(1));
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_sub_type<T>::value, T>
+    atomic_fetch_dec(T* dest, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest, T(1));
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_sub_type<T>::value, T>
+    atomic_fetch_dec(T* dest, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest, T(1), MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Max
+template <class T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMax(dest, val);
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMax(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_max(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Min
+template <class T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMin(dest, val);
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMin(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_min(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic And
+template <class T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAnd(dest, val);
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAnd(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_and(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic XOR
+template <class T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicXor(dest, val);
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicXor(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_xor(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic OR
+template <class T>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicOr(dest, val);
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicOr(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <class T, class MemoryOrder>
+__device__ inline
+    std::enable_if_t<Impl::is_cuda_atomic_integer_type<T>::value, T>
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_or(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+}  // namespace desul
+#endif
+
+#if !defined(__NVCC__)
+// Functions defined as device functions in CUDA which don't exist in the GCC overload
+// set
+namespace desul {
+
+#if defined(DESUL_HAVE_CUDA_ATOMICS_ASM)
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(TYPE, ORDER, SCOPE)                      \
+  inline void atomic_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void)atomic_fetch_add(dest, val, order, scope);                             \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(int32_t, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(long,
+                                MemoryOrderRelaxed,
+                                MemoryScopeDevice);  // only for ASM?
+DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned int, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned long long,
+                                MemoryOrderRelaxed,
+                                MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(float, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(double, MemoryOrderRelaxed, MemoryScopeDevice);
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(TYPE, ORDER, SCOPE)                      \
+  inline void atomic_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void)atomic_fetch_sub(dest, val, order, scope);                             \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(int32_t, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(long,
+                                MemoryOrderRelaxed,
+                                MemoryScopeDevice);  // only for ASM?
+DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(unsigned int, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(float, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(double, MemoryOrderRelaxed, MemoryScopeDevice);
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_INC(TYPE, ORDER, SCOPE)            \
+  inline void atomic_inc(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void)atomic_fetch_inc(dest, order, scope);                        \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_INC(unsigned int,
+                                MemoryOrderRelaxed,
+                                MemoryScopeDevice);  // only for ASM?
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(TYPE, ORDER, SCOPE)            \
+  inline void atomic_dec(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void)atomic_fetch_dec(dest, order, scope);                        \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(unsigned,
+                                MemoryOrderRelaxed,
+                                MemoryScopeDevice);  // only for ASM?
+
+#endif  // DESUL_HAVE_CUDA_ATOMICS_ASM
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_INC_MOD(TYPE, ORDER, SCOPE)                      \
+  inline TYPE atomic_fetch_inc_mod(TYPE* dest, TYPE val, ORDER order, SCOPE scope) { \
+    using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(TYPE)>::type;   \
+    cas_t oldval = reinterpret_cast<cas_t&>(*dest);                                  \
+    cas_t assume = oldval;                                                           \
+    do {                                                                             \
+      assume = oldval;                                                               \
+      TYPE newval = (reinterpret_cast<TYPE&>(assume) >= val)                         \
+                        ? static_cast<TYPE>(0)                                       \
+                        : reinterpret_cast<TYPE&>(assume) + static_cast<TYPE>(1);    \
+      oldval = desul::atomic_compare_exchange(reinterpret_cast<cas_t*>(dest),        \
+                                              assume,                                \
+                                              reinterpret_cast<cas_t&>(newval),      \
+                                              order,                                 \
+                                              scope);                                \
+    } while (assume != oldval);                                                      \
+    return reinterpret_cast<TYPE&>(oldval);                                          \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_INC_MOD(unsigned int,
+                                    MemoryOrderRelaxed,
+                                    MemoryScopeDevice);
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_DEC_MOD(TYPE, ORDER, SCOPE)                      \
+  inline TYPE atomic_fetch_dec_mod(TYPE* dest, TYPE val, ORDER order, SCOPE scope) { \
+    using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(TYPE)>::type;   \
+    cas_t oldval = reinterpret_cast<cas_t&>(*dest);                                  \
+    cas_t assume = oldval;                                                           \
+    do {                                                                             \
+      assume = oldval;                                                               \
+      TYPE newval = ((reinterpret_cast<TYPE&>(assume) == static_cast<TYPE>(0)) |     \
+                     (reinterpret_cast<TYPE&>(assume) > val))                        \
+                        ? val                                                        \
+                        : reinterpret_cast<TYPE&>(assume) - static_cast<TYPE>(1);    \
+      oldval = desul::atomic_compare_exchange(reinterpret_cast<cas_t*>(dest),        \
+                                              assume,                                \
+                                              reinterpret_cast<cas_t&>(newval),      \
+                                              order,                                 \
+                                              scope);                                \
+    } while (assume != oldval);                                                      \
+    return reinterpret_cast<TYPE&>(oldval);                                          \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_DEC_MOD(unsigned int,
+                                    MemoryOrderRelaxed,
+                                    MemoryScopeDevice);
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(TYPE, ORDER, SCOPE)                      \
+  inline TYPE atomic_fetch_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    return Impl::atomic_fetch_oper(                                                    \
+        Impl::AddOper<TYPE, const TYPE>(), dest, val, order, scope);                   \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(float, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(double, MemoryOrderRelaxed, MemoryScopeDevice);
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(TYPE, ORDER, SCOPE)                      \
+  inline TYPE atomic_fetch_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    return Impl::atomic_fetch_oper(                                                    \
+        Impl::SubOper<TYPE, const TYPE>(), dest, val, order, scope);                   \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(float, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(double, MemoryOrderRelaxed, MemoryScopeDevice);
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(TYPE, ORDER, SCOPE)                      \
+  inline TYPE atomic_fetch_max(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    return Impl::atomic_fetch_oper(                                                    \
+        Impl::MaxOper<TYPE, const TYPE>(), dest, val, order, scope);                   \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(int, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(long,
+                                      MemoryOrderRelaxed,
+                                      MemoryScopeDevice);  // only for ASM?
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned int,
+                                      MemoryOrderRelaxed,
+                                      MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long,
+                                      MemoryOrderRelaxed,
+                                      MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long
+//  long,MemoryOrderRelaxed,MemoryScopeDevice);
+
+#define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(TYPE, ORDER, SCOPE)                      \
+  inline TYPE atomic_fetch_min(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    return Impl::atomic_fetch_oper(                                                    \
+        Impl::MinOper<TYPE, const TYPE>(), dest, val, order, scope);                   \
+  }
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(int, MemoryOrderRelaxed, MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(long,
+                                      MemoryOrderRelaxed,
+                                      MemoryScopeDevice);  // only for ASM?
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned int,
+                                      MemoryOrderRelaxed,
+                                      MemoryScopeDevice);
+DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long,
+                                      MemoryOrderRelaxed,
+                                      MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long
+//  long,MemoryOrderRelaxed,MemoryScopeDevice); inline void atomic_fetch_max(int32_t*
+//  const dest, int32_t val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+
+}  // namespace desul
+
+// Functions defined int the GCC overload set but not in the device overload set
+namespace desul {
+__device__ inline unsigned long long atomic_fetch_add(unsigned long long* const dest,
+                                                      unsigned long long val,
+                                                      MemoryOrderRelaxed order,
+                                                      MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::AddOper<unsigned long long, const unsigned long long>(),
+      dest,
+      val,
+      order,
+      scope);
+}
+__device__ inline long long atomic_fetch_add(long long* const dest,
+                                             long long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_fetch_add(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::AddOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_fetch_sub(long long* const dest,
+                                             long long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_fetch_sub(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::SubOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_fetch_max(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::MaxOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_fetch_min(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::MinOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_fetch_or(long* const dest,
+                                       long val,
+                                       MemoryOrderRelaxed order,
+                                       MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::OrOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_fetch_or(long long* const dest,
+                                            long long val,
+                                            MemoryOrderRelaxed order,
+                                            MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_fetch_xor(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::XorOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_fetch_xor(long long* const dest,
+                                             long long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_fetch_and(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::AndOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_fetch_and(long long* const dest,
+                                             long long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+}
+
+__device__ inline unsigned long long atomic_add_fetch(unsigned long long* const dest,
+                                                      unsigned long long val,
+                                                      MemoryOrderRelaxed order,
+                                                      MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::AddOper<unsigned long long, const unsigned long long>(),
+      dest,
+      val,
+      order,
+      scope);
+}
+__device__ inline long long atomic_add_fetch(long long* const dest,
+                                             long long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_add_fetch(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::AddOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_sub_fetch(long long* const dest,
+                                             long long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_sub_fetch(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::SubOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_or_fetch(long long* const dest,
+                                            long long val,
+                                            MemoryOrderRelaxed order,
+                                            MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_or_fetch(long* const dest,
+                                       long val,
+                                       MemoryOrderRelaxed order,
+                                       MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::OrOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_xor_fetch(long long* const dest,
+                                             long long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_xor_fetch(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::XorOper<long, const long>(), dest, val, order, scope);
+}
+__device__ inline long long atomic_and_fetch(long long* const dest,
+                                             long val,
+                                             MemoryOrderRelaxed order,
+                                             MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+}
+__device__ inline long atomic_and_fetch(long* const dest,
+                                        long val,
+                                        MemoryOrderRelaxed order,
+                                        MemoryScopeDevice scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::AndOper<long, const long>(), dest, val, order, scope);
+}
+}  // namespace desul
+#endif
+
+#endif  // DESUL_HAVE_CUDA_ATOMICS
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Common.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Common.hpp
similarity index 91%
rename from packages/kokkos/core/src/desul/atomics/Common.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Common.hpp
index 1b8dc9f58..aef098e4d 100644
--- a/packages/kokkos/core/src/desul/atomics/Common.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Common.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -8,11 +8,12 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #ifndef DESUL_ATOMICS_COMMON_HPP_
 #define DESUL_ATOMICS_COMMON_HPP_
-#include "desul/atomics/Macros.hpp"
-#include <cstdint>
 #include <atomic>
+#include <cstdint>
 #include <type_traits>
 
+#include "desul/atomics/Macros.hpp"
+
 namespace desul {
 struct alignas(16) Dummy16ByteValue {
   int64_t value1;
@@ -137,20 +138,21 @@ using cmpexch_failure_memory_order =
     typename CmpExchFailureOrder<MemoryOrder>::memory_order;
 }  // namespace Impl
 
-}
+}  // namespace desul
 
-// We should in principle use std::numeric_limits, but that requires constexpr function support on device
-// Currently that is still considered experimetal on CUDA and sometimes not reliable.
+// We should in principle use std::numeric_limits, but that requires constexpr function
+// support on device Currently that is still considered experimetal on CUDA and
+// sometimes not reliable.
 namespace desul {
 namespace Impl {
-template<class T>
+template <class T>
 struct numeric_limits_max;
 
-template<>
+template <>
 struct numeric_limits_max<uint32_t> {
   static constexpr uint32_t value = 0xffffffffu;
 };
-template<>
+template <>
 struct numeric_limits_max<uint64_t> {
   static constexpr uint64_t value = 0xfffffffflu;
 };
@@ -172,30 +174,32 @@ DESUL_INLINE_FUNCTION bool atomic_is_lock_free() noexcept {
       ;
 }
 
-template<std::size_t N>
+template <std::size_t N>
 struct atomic_compare_exchange_type;
 
-template<>
+template <>
 struct atomic_compare_exchange_type<4> {
   using type = int32_t;
 };
 
-template<>
+template <>
 struct atomic_compare_exchange_type<8> {
   using type = int64_t;
 };
 
-template<>
+template <>
 struct atomic_compare_exchange_type<16> {
   using type = Dummy16ByteValue;
 };
 
-template<class T>
-struct dont_deduce_this_parameter { using type = T; };
+template <class T>
+struct dont_deduce_this_parameter {
+  using type = T;
+};
 
-template<class T>
+template <class T>
 using dont_deduce_this_parameter_t = typename dont_deduce_this_parameter<T>::type;
 
-}
-}
+}  // namespace Impl
+}  // namespace desul
 #endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp
similarity index 99%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp
index 81c90db79..d947dac27 100644
--- a/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp
@@ -9,9 +9,8 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
 #define DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
 
-#include "desul/atomics/Macros.hpp"
-
 #include "desul/atomics/Compare_Exchange_ScopeCaller.hpp"
+#include "desul/atomics/Macros.hpp"
 
 #ifdef DESUL_HAVE_GCC_ATOMICS
 #include "desul/atomics/Compare_Exchange_GCC.hpp"
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp
similarity index 69%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp
index abe566c7b..310c59f55 100644
--- a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_CUDA.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -40,7 +40,7 @@ __device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
 __device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
   __threadfence_block();
 }
-#if (__CUDA_ARCH__>=600) || !defined(__NVCC__)
+#if (__CUDA_ARCH__ >= 600) || !defined(__NVCC__)
 __device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
   __threadfence_system();
 }
@@ -55,19 +55,21 @@ __device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
 }
 #endif
 #endif
-}
+}  // namespace desul
 
-// Compare Exchange for PRE Volta, not supported with CLANG as CUDA compiler, since we do NOT have a way
-// of having the code included for clang only when the CC is smaller than 700
-// But on Clang the device side symbol list must be independent of __CUDA_ARCH__
+// Compare Exchange for PRE Volta, not supported with CLANG as CUDA compiler, since we
+// do NOT have a way of having the code included for clang only when the CC is smaller
+// than 700 But on Clang the device side symbol list must be independent of
+// __CUDA_ARCH__
 // FIXME temporary fix for https://github.com/kokkos/kokkos/issues/4390
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) || \
-(!defined(__NVCC__) && defined(DESUL_CUDA_ARCH_IS_PRE_VOLTA) && 0)
+    (!defined(__NVCC__) && defined(DESUL_CUDA_ARCH_IS_PRE_VOLTA) && 0)
 namespace desul {
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
   unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
                                       reinterpret_cast<unsigned int&>(compare),
                                       reinterpret_cast<unsigned int&>(value));
@@ -76,7 +78,8 @@ __device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_excha
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
   unsigned long long int return_val =
       atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
                 reinterpret_cast<unsigned long long int&>(compare),
@@ -85,34 +88,41 @@ __device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_excha
 }
 
 template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
-  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return return_val;
 }
 
 template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
-  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
   return return_val;
 }
 
 template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
-  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return return_val;
 }
 
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
     T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
   unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
                                        reinterpret_cast<unsigned int&>(value));
   return reinterpret_cast<T&>(return_val);
@@ -120,7 +130,8 @@ __device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
     T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
-  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
   unsigned long long int return_val =
       atomicExch(reinterpret_cast<unsigned long long int*>(dest),
                  reinterpret_cast<unsigned long long int&>(value));
@@ -128,27 +139,27 @@ __device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
 }
 
 template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
-    T* const dest, T value, MemoryOrderRelease, MemoryScope) {
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderRelease, MemoryScope) {
   T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return reinterpret_cast<T&>(return_val);
 }
 
 template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
-    T* const dest, T value, MemoryOrderAcquire, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
   T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
   return reinterpret_cast<T&>(return_val);
 }
 
 template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
-    T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
   T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return reinterpret_cast<T&>(return_val);
 }
 }  // namespace desul
@@ -162,8 +173,8 @@ __device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type at
 // We simply can say DESUL proper doesn't support clang CUDA build pre Volta,
 // Kokkos has that knowledge and so I use it here, allowing in Kokkos to use
 // clang with pre Volta as CUDA compiler
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__>=700)) || \
-     (!defined(__NVCC__) && !defined(DESUL_CUDA_ARCH_IS_PRE_VOLTA))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) || \
+    (!defined(__NVCC__) && !defined(DESUL_CUDA_ARCH_IS_PRE_VOLTA))
 #include <desul/atomics/cuda/CUDA_asm_exchange.hpp>
 #endif
 
@@ -174,42 +185,45 @@ namespace desul {
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
     T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
-  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return return_val;
 }
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
     T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
-  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return return_val;
 }
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
-  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return return_val;
 }
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
-  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
-  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
-  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return return_val;
 }
-}
+}  // namespace desul
 #endif
 
 #if defined(__CUDA_ARCH__) || !defined(__NVCC__)
 namespace desul {
 template <typename T, class MemoryOrder, class MemoryScope>
-__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_compare_exchange(
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
   // This is a way to avoid dead lock in a warp or wave front
   T return_val;
@@ -220,12 +234,13 @@ __device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::typ
   while (active != done_active) {
     if (!done) {
       if (Impl::lock_address_cuda((void*)dest, scope)) {
-        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
-        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
         return_val = *dest;
-        if(return_val == compare) {
+        if (return_val == compare) {
           *dest = value;
-          atomic_thread_fence(MemoryOrderRelease(),scope);
+          atomic_thread_fence(MemoryOrderRelease(), scope);
         }
         Impl::unlock_address_cuda((void*)dest, scope);
         done = 1;
@@ -236,8 +251,8 @@ __device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::typ
   return return_val;
 }
 template <typename T, class MemoryOrder, class MemoryScope>
-__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
-    T* const dest, T value, MemoryOrder, MemoryScope scope) {
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) {
   // This is a way to avoid dead lock in a warp or wave front
   T return_val;
   int done = 0;
@@ -247,11 +262,12 @@ __device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::typ
   while (active != done_active) {
     if (!done) {
       if (Impl::lock_address_cuda((void*)dest, scope)) {
-        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
-        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
         return_val = *dest;
         *dest = value;
-        atomic_thread_fence(MemoryOrderRelease(),scope);
+        atomic_thread_fence(MemoryOrderRelease(), scope);
         Impl::unlock_address_cuda((void*)dest, scope);
         done = 1;
       }
@@ -260,9 +276,8 @@ __device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::typ
   }
   return return_val;
 }
-}
+}  // namespace desul
 #endif
 
-
 #endif
 #endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_GCC.hpp
similarity index 65%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_GCC.hpp
index 418bea0b8..fad3c43b0 100644
--- a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_GCC.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -18,39 +18,39 @@ SPDX-License-Identifier: (BSD-3-Clause)
 namespace desul {
 
 namespace Impl {
-template<class T>
+template <class T>
 struct atomic_exchange_available_gcc {
   constexpr static bool value =
 #ifndef DESUL_HAVE_LIBATOMIC
-    ((sizeof(T)==4 && alignof(T)==4) ||
+      ((sizeof(T) == 4 && alignof(T) == 4) ||
 #ifdef DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
-     (sizeof(T)==16 && alignof(T)==16) ||
+       (sizeof(T) == 16 && alignof(T) == 16) ||
 #endif
-     (sizeof(T)==8 && alignof(T)==8)) &&
+       (sizeof(T) == 8 && alignof(T) == 8)) &&
 #endif
-    std::is_trivially_copyable<T>::value;
+      std::is_trivially_copyable<T>::value;
 };
-} //namespace Impl
+}  // namespace Impl
 
-#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+#if defined(__clang__) && (__clang_major__ >= 7) && !defined(__APPLE__)
+// clang-format off
 // Disable warning for large atomics on clang 7 and up (checked with godbolt)
 // error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
 // https://godbolt.org/z/G7YhqhbG6
+// clang-format on
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Watomic-alignment"
 #endif
-template<class MemoryOrder, class MemoryScope>
+template <class MemoryOrder, class MemoryScope>
 void atomic_thread_fence(MemoryOrder, MemoryScope) {
   __atomic_thread_fence(GCCMemoryOrder<MemoryOrder>::value);
 }
 
 template <typename T, class MemoryOrder, class MemoryScope>
-std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
-atomic_exchange(
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T> atomic_exchange(
     T* dest, T value, MemoryOrder, MemoryScope) {
   T return_val;
-  __atomic_exchange(
-     dest, &value, &return_val, GCCMemoryOrder<MemoryOrder>::value);
+  __atomic_exchange(dest, &value, &return_val, GCCMemoryOrder<MemoryOrder>::value);
   return return_val;
 }
 
@@ -58,17 +58,19 @@ atomic_exchange(
 // Those two get handled separatly.
 template <typename T, class MemoryOrder, class MemoryScope>
 std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
-atomic_compare_exchange(
-    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
-  (void)__atomic_compare_exchange(
-      dest, &compare, &value, false, GCCMemoryOrder<MemoryOrder>::value, GCCMemoryOrder<MemoryOrder>::value);
+atomic_compare_exchange(T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  (void)__atomic_compare_exchange(dest,
+                                  &compare,
+                                  &value,
+                                  false,
+                                  GCCMemoryOrder<MemoryOrder>::value,
+                                  GCCMemoryOrder<MemoryOrder>::value);
   return compare;
 }
 
 template <typename T, class MemoryScope>
 std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
-atomic_compare_exchange(
-    T* dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+atomic_compare_exchange(T* dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
   (void)__atomic_compare_exchange(
       dest, &compare, &value, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
   return compare;
@@ -76,14 +78,13 @@ atomic_compare_exchange(
 
 template <typename T, class MemoryScope>
 std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
-atomic_compare_exchange(
-    T* dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+atomic_compare_exchange(T* dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
   (void)__atomic_compare_exchange(
       dest, &compare, &value, false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
   return compare;
 }
 
-#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+#if defined(__clang__) && (__clang_major__ >= 7) && !defined(__APPLE__)
 #pragma GCC diagnostic pop
 #endif
 }  // namespace desul
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp
similarity index 88%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp
index d6bf04a7e..96739bc1f 100644
--- a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp
@@ -13,51 +13,39 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #ifdef DESUL_HAVE_HIP_ATOMICS
 namespace desul {
-#if defined(__HIP_DEVICE_COMPILE__)
 inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
   __threadfence();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
   __threadfence();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
   __threadfence();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
   __threadfence();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
   __threadfence_block();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
   __threadfence_block();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
   __threadfence_block();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
   __threadfence_block();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
   __threadfence_system();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeNode) {
   __threadfence_system();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeNode) {
   __threadfence_system();
 }
-
 inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
   __threadfence_system();
 }
@@ -165,24 +153,15 @@ atomic_exchange(T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
 template <typename T, class MemoryScope>
 __device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
 atomic_exchange(T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
-          atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-            T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
-              atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
-                return reinterpret_cast<T&>(return_val);
-}
-
-template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
-    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
   atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
-  T return_val = atomic_compare_exchange(
-      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
   atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
-  return return_val;
+  return reinterpret_cast<T&>(return_val);
 }
 
 template <typename T, class MemoryScope>
-__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
     T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
   atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
   T return_val = atomic_compare_exchange(
@@ -192,10 +171,9 @@ __device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_excha
 }
 
 template <typename T, class MemoryOrder, class MemoryScope>
-DESUL_INLINE_FUNCTION __device__
-    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
-    atomic_compare_exchange(
-        T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
   // This is a way to avoid dead lock in a warp or wave front
   T return_val;
   int done = 0;
@@ -222,9 +200,8 @@ DESUL_INLINE_FUNCTION __device__
 }
 
 template <typename T, class MemoryOrder, class MemoryScope>
-DESUL_INLINE_FUNCTION __device__
-    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
-    atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) {
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) {
   // This is a way to avoid dead lock in a warp or wave front
   T return_val;
   int done = 0;
@@ -247,7 +224,6 @@ DESUL_INLINE_FUNCTION __device__
   }
   return return_val;
 }
-#endif
 }  // namespace desul
 #endif
 #endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_MSVC.hpp
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_MSVC.hpp
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenMP.hpp
similarity index 71%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenMP.hpp
index ded401f52..dfea81a4d 100644
--- a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenMP.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -7,101 +7,104 @@ SPDX-License-Identifier: (BSD-3-Clause)
 */
 #ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
 #define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
-#include "desul/atomics/Common.hpp"
-#include <cstdio>
 #include <omp.h>
 
+#include "desul/atomics/Common.hpp"
+
 #ifdef DESUL_HAVE_OPENMP_ATOMICS
 namespace desul {
 
 #if _OPENMP > 201800
 // atomic_thread_fence for Core Scope
 inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
-  // There is no seq_cst flush in OpenMP, isn't it the same anyway for fence?
-  #pragma omp flush acq_rel
+// There is no seq_cst flush in OpenMP, isn't it the same anyway for fence?
+#pragma omp flush acq_rel
 }
 inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
-  #pragma omp flush acq_rel
+#pragma omp flush acq_rel
 }
 inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
-  #pragma omp flush release
+#pragma omp flush release
 }
 inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
-  #pragma omp flush acquire
+#pragma omp flush acquire
 }
 // atomic_thread_fence for Device Scope
 inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
-  // There is no seq_cst flush in OpenMP, isn't it the same anyway for fence?
-  #pragma omp flush acq_rel
+// There is no seq_cst flush in OpenMP, isn't it the same anyway for fence?
+#pragma omp flush acq_rel
 }
 inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
-  #pragma omp flush acq_rel
+#pragma omp flush acq_rel
 }
 inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
-  #pragma omp flush release
+#pragma omp flush release
 }
 inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
-  #pragma omp flush acquire
+#pragma omp flush acquire
 }
 #else
 // atomic_thread_fence for Core Scope
 inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
-  #pragma omp flush
+#pragma omp flush
 }
 inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
-  #pragma omp flush
+#pragma omp flush
 }
 inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
-  #pragma omp flush
+#pragma omp flush
 }
 inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
-  #pragma omp flush
+#pragma omp flush
 }
 // atomic_thread_fence for Device Scope
 inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
-  #pragma omp flush
+#pragma omp flush
 }
 inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
-  #pragma omp flush
+#pragma omp flush
 }
 inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
-  #pragma omp flush
+#pragma omp flush
 }
 inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
-  #pragma omp flush
+#pragma omp flush
 }
 #endif
 
 template <typename T, class MemoryOrder, class MemoryScope>
-T atomic_exchange(
-    T* dest, T value, MemoryOrder, MemoryScope) {
+T atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) {
   T return_val;
-  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
-    atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  if (!std::is_same<MemoryOrder, MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
   T& x = *dest;
-  #pragma omp atomic capture
-  { return_val = x; x = value; }
-  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
-    atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+#pragma omp atomic capture
+  {
+    return_val = x;
+    x = value;
+  }
+  if (!std::is_same<MemoryOrder, MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
   return return_val;
 }
 
-// OpenMP doesn't have compare exchange, so we use build-ins and rely on testing that this works
-// Note that means we test this in OpenMPTarget offload regions!
+// OpenMP doesn't have compare exchange, so we use build-ins and rely on testing that
+// this works Note that means we test this in OpenMPTarget offload regions!
 template <typename T, class MemoryOrder, class MemoryScope>
-std::enable_if_t<Impl::atomic_always_lock_free(sizeof(T)),T> atomic_compare_exchange(
+std::enable_if_t<Impl::atomic_always_lock_free(sizeof(T)), T> atomic_compare_exchange(
     T* dest, T compare, T value, MemoryOrder, MemoryScope) {
   using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(T)>::type;
-  cas_t retval = __sync_val_compare_and_swap(
-     reinterpret_cast<volatile cas_t*>(dest), 
-     reinterpret_cast<cas_t&>(compare), 
-     reinterpret_cast<cas_t&>(value));
+  cas_t retval = __sync_val_compare_and_swap(reinterpret_cast<volatile cas_t*>(dest),
+                                             reinterpret_cast<cas_t&>(compare),
+                                             reinterpret_cast<cas_t&>(value));
   return reinterpret_cast<T&>(retval);
 }
 
-#if defined(__clang__) && (__clang_major__>=7)
+#if defined(__clang__) && (__clang_major__ >= 7)
 // Disable warning for large atomics on clang 7 and up (checked with godbolt)
+// clang-format off
 // error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
+// clang-format on
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Watomic-alignment"
 #endif
@@ -130,7 +133,7 @@ atomic_compare_exchange(T* /*dest*/, T /*compare*/, T value, MemoryOrder, Memory
 }
 #pragma omp end declare variant
 
-#if defined(__clang__) && (__clang_major__>=7)
+#if defined(__clang__) && (__clang_major__ >= 7)
 #pragma GCC diagnostic pop
 #endif
 
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp
new file mode 100644
index 000000000..6c8c68511
--- /dev/null
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp
@@ -0,0 +1,99 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+
+// clang-format off
+#include "desul/atomics/SYCLConversions.hpp"
+#include "desul/atomics/Common.hpp"
+
+#include <CL/sycl.hpp>
+// clang-format on
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+
+namespace desul {
+
+template <class MemoryOrder, class MemoryScope>
+inline void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  sycl::atomic_fence(
+      Impl::DesulToSYCLMemoryOrder<MemoryOrder, /*extended namespace*/ false>::value,
+      Impl::DesulToSYCLMemoryScope<MemoryScope, /*extended namespace*/ false>::value);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  Impl::sycl_atomic_ref<unsigned int, MemoryOrder, MemoryScope> dest_ref(
+      *reinterpret_cast<unsigned int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned int*>(&compare),
+                                   *reinterpret_cast<unsigned int*>(&value));
+  return compare;
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long is 64-bit");
+  Impl::sycl_atomic_ref<unsigned long long int, MemoryOrder, MemoryScope> dest_ref(
+      *reinterpret_cast<unsigned long long int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned long long int*>(&compare),
+                                   *reinterpret_cast<unsigned long long int*>(&value));
+  return compare;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(T* const dest,
+                                                                 T value,
+                                                                 MemoryOrder,
+                                                                 MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  Impl::sycl_atomic_ref<unsigned int, MemoryOrder, MemoryScope> dest_ref(
+      *reinterpret_cast<unsigned int*>(dest));
+  unsigned int return_val = dest_ref.exchange(*reinterpret_cast<unsigned int*>(&value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(T* const dest,
+                                                                 T value,
+                                                                 MemoryOrder,
+                                                                 MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long is 64-bit");
+  Impl::sycl_atomic_ref<unsigned long long int, MemoryOrder, MemoryScope> dest_ref(
+      *reinterpret_cast<unsigned long long int*>(dest));
+  unsigned long long int return_val =
+      dest_ref.exchange(reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+atomic_compare_exchange(
+    T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return compare;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
+    T* const /*dest*/, T value, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return value;
+}
+
+}  // namespace desul
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_ScopeCaller.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_ScopeCaller.hpp
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange_ScopeCaller.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_ScopeCaller.hpp
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_Serial.hpp
similarity index 83%
rename from packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_Serial.hpp
index be7b46d5f..9d0db5c9e 100644
--- a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_Serial.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -10,13 +10,11 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #ifdef DESUL_HAVE_SERIAL_ATOMICS
 namespace desul {
-template<class MemoryScope>
-void atomic_thread_fence(MemoryOrderAcquire, MemoryScope) {
-}
+template <class MemoryScope>
+void atomic_thread_fence(MemoryOrderAcquire, MemoryScope) {}
 
-template<class MemoryScope>
-void atomic_thread_fence(MemoryOrderRelease, MemoryScope) {
-}
+template <class MemoryScope>
+void atomic_thread_fence(MemoryOrderRelease, MemoryScope) {}
 
 template <typename T, class MemoryScope>
 T atomic_compare_exchange(
diff --git a/packages/kokkos/core/src/desul/atomics/GCC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/GCC.hpp
similarity index 90%
rename from packages/kokkos/core/src/desul/atomics/GCC.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/GCC.hpp
index cd0c2bea1..239c84fd3 100644
--- a/packages/kokkos/core/src/desul/atomics/GCC.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/GCC.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -10,7 +10,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #ifdef DESUL_HAVE_GCC_ATOMICS
 
-#include<type_traits>
+#include <type_traits>
 /*
 Built - in Function : type __atomic_add_fetch(type * ptr, type val, int memorder)
 Built - in Function : type __atomic_sub_fetch(type * ptr, type val, int memorder)
@@ -91,18 +91,20 @@ DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeDevice)
 DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeCore)
 
 template <typename T, class MemoryOrder, class MemoryScope>
-std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
-atomic_exchange(T* const dest,
-                  Impl::dont_deduce_this_parameter_t<const T> val,
-                  MemoryOrder /*order*/,
-                  MemoryScope scope) {
+std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T> atomic_exchange(
+    T* const dest,
+    Impl::dont_deduce_this_parameter_t<const T> val,
+    MemoryOrder /*order*/,
+    MemoryScope scope) {
   // Acquire a lock for the address
+  // clang-format off
   while (!Impl::lock_address((void*)dest, scope)) {}
+  // clang-format on
 
-  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  atomic_thread_fence(MemoryOrderAcquire(), scope);
   T return_val = *dest;
   *dest = val;
-  atomic_thread_fence(MemoryOrderRelease(),scope);
+  atomic_thread_fence(MemoryOrderRelease(), scope);
   Impl::unlock_address((void*)dest, scope);
   return return_val;
 }
@@ -110,18 +112,20 @@ atomic_exchange(T* const dest,
 template <typename T, class MemoryOrder, class MemoryScope>
 std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
 atomic_compare_exchange(T* const dest,
-                  Impl::dont_deduce_this_parameter_t<const T> compare,
-                  Impl::dont_deduce_this_parameter_t<const T> val,
-                  MemoryOrder /*order*/,
-                  MemoryScope scope) {
+                        Impl::dont_deduce_this_parameter_t<const T> compare,
+                        Impl::dont_deduce_this_parameter_t<const T> val,
+                        MemoryOrder /*order*/,
+                        MemoryScope scope) {
   // Acquire a lock for the address
+  // clang-format off
   while (!Impl::lock_address((void*)dest, scope)) {}
+  // clang-format on
 
-  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  atomic_thread_fence(MemoryOrderAcquire(), scope);
   T return_val = *dest;
-  if(return_val == compare) {
+  if (return_val == compare) {
     *dest = val;
-    atomic_thread_fence(MemoryOrderRelease(),scope);
+    atomic_thread_fence(MemoryOrderRelease(), scope);
   }
   Impl::unlock_address((void*)dest, scope);
   return return_val;
diff --git a/packages/kokkos/core/src/desul/atomics/Generic.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/Generic.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp
diff --git a/packages/kokkos/core/src/desul/atomics/HIP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/HIP.hpp
similarity index 99%
rename from packages/kokkos/core/src/desul/atomics/HIP.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/HIP.hpp
index 5365ab913..e51406e54 100644
--- a/packages/kokkos/core/src/desul/atomics/HIP.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/HIP.hpp
@@ -109,7 +109,6 @@ DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int)
 #undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL
 #undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP
 
-
 // 2/ host-side fallback implementation for atomic functions not provided by GCC
 
 #define DESUL_IMPL_HIP_HOST_FALLBACK_ATOMIC_FUN(OP_LOWERCASE, OP_PASCAL_CASE, TYPE) \
@@ -177,7 +176,6 @@ DESUL_IMPL_HIP_HOST_FALLBACK_ATOMIC_INCREMENT_DECREMENT(unsigned long long)
 
 #undef DESUL_IMPL_HIP_HOST_FALLBACK_ATOMIC_INCREMENT_DECREMENT
 
-
 // 3/ device-side fallback implementation for atomic functions defined in GCC overload
 // set
 
@@ -222,4 +220,3 @@ DESUL_IMPL_HIP_DEVICE_FALLBACK_ATOMIC_FUN(nand, Nand)
 
 #endif
 #endif
-
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array.hpp
similarity index 81%
rename from packages/kokkos/core/src/desul/atomics/Lock_Array.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array.hpp
index 8fd0e8bbd..6b2d4e74b 100644
--- a/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array.hpp
@@ -57,17 +57,14 @@ inline void finalize_lock_arrays() {
 }
 template <typename MemoryScope>
 inline bool lock_address(void* ptr, MemoryScope ms) {
-  return 0 == atomic_exchange(host_locks__::get_host_lock_(ptr),
-                                      int32_t(1),
-                                      MemoryOrderSeqCst(),
-                                      ms);
+  return 0 ==
+         atomic_exchange(
+             host_locks__::get_host_lock_(ptr), int32_t(1), MemoryOrderSeqCst(), ms);
 }
 template <typename MemoryScope>
 void unlock_address(void* ptr, MemoryScope ms) {
-  (void)atomic_exchange(host_locks__::get_host_lock_(ptr),
-                                int32_t(0),
-                                MemoryOrderSeqCst(),
-                                ms);
+  (void)atomic_exchange(
+      host_locks__::get_host_lock_(ptr), int32_t(0), MemoryOrderSeqCst(), ms);
 }
 }  // namespace Impl
 }  // namespace desul
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
similarity index 75%
rename from packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
index de9918534..2166fa3cb 100644
--- a/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -9,8 +9,8 @@ SPDX-License-Identifier: (BSD-3-Clause)
 #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
 #define DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
 
-#include "desul/atomics/Macros.hpp"
 #include "desul/atomics/Common.hpp"
+#include "desul/atomics/Macros.hpp"
 
 #ifdef DESUL_HAVE_CUDA_ATOMICS
 
@@ -23,7 +23,7 @@ namespace Impl {
 #define DESUL_IMPL_BALLOT_MASK(m, x) __ballot_sync(m, x)
 #define DESUL_IMPL_ACTIVEMASK __activemask()
 #else
-#define DESUL_IMPL_BALLOT_MASK(m, x) m==0?0:1
+#define DESUL_IMPL_BALLOT_MASK(m, x) m == 0 ? 0 : 1
 #define DESUL_IMPL_ACTIVEMASK 0
 #endif
 
@@ -32,14 +32,13 @@ namespace Impl {
 extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h;
 extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h;
 
-
 /// \brief After this call, the g_host_cuda_lock_arrays variable has
 ///        valid, initialized arrays.
 ///
 /// This call is idempotent.
 /// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
 ///   snapshotted version while also linking against pure Desul
-template<typename /*AlwaysInt*/ = int>
+template <typename /*AlwaysInt*/ = int>
 void init_lock_arrays_cuda();
 
 /// \brief After this call, the g_host_cuda_lock_arrays variable has
@@ -47,8 +46,8 @@ void init_lock_arrays_cuda();
 ///
 /// This call is idempotent.
 /// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
-///   snappshotted version while also linking against pure Desul
-template<typename T = int>
+///   snapshotted version while also linking against pure Desul
+template <typename /*AlwaysInt*/ = int>
 void finalize_lock_arrays_cuda();
 
 }  // namespace Impl
@@ -77,7 +76,7 @@ namespace Impl {
 /// instances in other translation units, we must update this CUDA global
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
-/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
 __device__
 #ifdef __CUDACC_RDC__
     __constant__ extern
@@ -139,34 +138,42 @@ namespace {
 static int lock_array_copied = 0;
 inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
-}  // namespace Impl
-}  // namespace desul
-/* It is critical that this code be a macro, so that it will
-   capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
-   putting this in an inline function will NOT do the right thing! */
-#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                       \
-  {                                                                        \
-    if (::desul::Impl::lock_array_copied == 0) {                           \
-      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE,    \
-                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
-                         sizeof(int32_t*));                                \
-      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE,    \
-                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
-                         sizeof(int32_t*));                                \
-    }                                                                      \
-    ::desul::Impl::lock_array_copied = 1;                                  \
+
+#ifdef __CUDACC_RDC__
+inline
+#else
+static
+#endif
+    void
+    copy_cuda_lock_arrays_to_device() {
+  if (lock_array_copied == 0) {
+    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
+                       &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
+                       sizeof(int32_t*));
+    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
+                       &CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
+                       sizeof(int32_t*));
   }
+  lock_array_copied = 1;
+}
 
+}  // namespace Impl
+}  // namespace desul
 
 #endif /* defined( __CUDACC__ ) */
 
-#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
+
+namespace desul {
 
 #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
-#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+inline void ensure_cuda_lock_arrays_on_device() {}
 #else
-#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
-  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+static inline void ensure_cuda_lock_arrays_on_device() {
+  Impl::copy_cuda_lock_arrays_to_device();
+}
 #endif
 
-#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
+}  // namespace desul
+
+#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp
similarity index 82%
rename from packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp
index 9e6f5e598..7c843f23c 100644
--- a/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_HIP.hpp
@@ -6,8 +6,8 @@ Source: https://github.com/desul/desul
 SPDX-License-Identifier: (BSD-3-Clause)
 */
 
-#ifndef DESUL_ATORMICS_LOCK_ARRAY_HIP_HPP_
-#define DESUL_ATORMICS_LOCK_ARRAY_HIP_HPP_
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_HIP_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_HIP_HPP_
 
 #include "desul/atomics/Common.hpp"
 #include "desul/atomics/Macros.hpp"
@@ -23,6 +23,8 @@ namespace Impl {
 
 #ifdef __HIP_DEVICE_COMPILE__
 #define DESUL_IMPL_BALLOT_MASK(x) __ballot(x)
+#else
+#define DESUL_IMPL_BALLOT_MASK(x) 0
 #endif
 
 /**
@@ -37,8 +39,8 @@ extern int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h;
 ///
 /// This call is idempotent.
 /// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
-///   snappshotted version while also linking against pure Desul
-template<typename T = int>
+///   snapshotted version while also linking against pure Desul
+template <typename /*AlwaysInt*/ = int>
 void init_lock_arrays_hip();
 
 /// \brief After this call, the g_host_cuda_lock_arrays variable has
@@ -46,8 +48,8 @@ void init_lock_arrays_hip();
 ///
 /// This call is idempotent.
 /// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
-///   snappshotted version while also linking against pure Desul
-template<typename T = int>
+///   snapshotted version while also linking against pure Desul
+template <typename /*AlwaysInt*/ = int>
 void finalize_lock_arrays_hip();
 }  // namespace Impl
 }  // namespace desul
@@ -145,17 +147,18 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 /* It is critical that this code be a macro, so that it will
    capture the right address for g_device_hip_lock_arrays!
    putting this in an inline function will NOT do the right thing! */
-#define DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                               \
-  {                                                                               \
-    if (::desul::Impl::lock_array_copied == 0) {                                  \
-      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE), \
-                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,          \
-                        sizeof(int32_t*));                                        \
-      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE),   \
-                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE_h,            \
-                        sizeof(int32_t*));                                        \
-    }                                                                             \
-    ::desul::Impl::lock_array_copied = 1;                                         \
+#define DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                                   \
+  {                                                                                   \
+    if (::desul::Impl::lock_array_copied == 0) {                                      \
+      (void)hipMemcpyToSymbol(                                                        \
+          HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE),                   \
+          &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,                            \
+          sizeof(int32_t*));                                                          \
+      (void)hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE), \
+                              &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE_h,          \
+                              sizeof(int32_t*));                                      \
+    }                                                                                 \
+    ::desul::Impl::lock_array_copied = 1;                                             \
   }
 
 #endif
diff --git a/packages/kokkos/core/src/desul/atomics/Macros.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/Macros.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp
diff --git a/packages/kokkos/core/src/desul/atomics/OpenMP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/OpenMP.hpp
similarity index 87%
rename from packages/kokkos/core/src/desul/atomics/OpenMP.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/OpenMP.hpp
index 3fa22c36a..bc6fb26c2 100644
--- a/packages/kokkos/core/src/desul/atomics/OpenMP.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/OpenMP.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -10,6 +10,6 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #ifdef DESUL_HAVE_OPENMP_ATOMICS
 
-#include<desul/atomics/openmp/OpenMP_40.hpp>
+#include <desul/atomics/openmp/OpenMP_40.hpp>
 #endif
 #endif
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/SYCL.hpp
new file mode 100644
index 000000000..da34564f6
--- /dev/null
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/SYCL.hpp
@@ -0,0 +1,64 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_SYCL_HPP_
+#define DESUL_ATOMICS_SYCL_HPP_
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+
+// clang-format off
+#include "desul/atomics/SYCLConversions.hpp"
+#include "desul/atomics/Common.hpp"
+// clang-format on
+
+namespace desul {
+
+#define DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, TYPE)                              \
+  template <class MemoryOrder>                                                     \
+  TYPE atomic_fetch_##OPER(TYPE* dest, TYPE val, MemoryOrder, MemoryScopeDevice) { \
+    Impl::sycl_atomic_ref<TYPE, MemoryOrder, MemoryScopeDevice> dest_ref(*dest);   \
+    return dest_ref.fetch_##OPER(val);                                             \
+  }                                                                                \
+  template <class MemoryOrder>                                                     \
+  TYPE atomic_fetch_##OPER(TYPE* dest, TYPE val, MemoryOrder, MemoryScopeCore) {   \
+    Impl::sycl_atomic_ref<TYPE, MemoryOrder, MemoryScopeCore> dest_ref(*dest);     \
+    return dest_ref.fetch_##OPER(val);                                             \
+  }
+
+#define DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(OPER) \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, int)           \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, unsigned int)  \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, long)          \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, unsigned long) \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, long long)     \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, unsigned long long)
+
+#define DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(OPER) \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, float)               \
+  DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER(OPER, double)
+
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(add)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(sub)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(and)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(or)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(xor)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(min)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL(max)
+
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(add)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(sub)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(min)
+DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT(max)
+
+#undef DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_FLOATING_POINT
+#undef DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER_INTEGRAL
+#undef DESUL_IMPL_SYCL_ATOMIC_FETCH_OPER
+
+}  // namespace desul
+
+#endif  // DESUL_HAVE_SYCL_ATOMICS
+#endif  // DESUL_ATOMICS_SYCL_HPP_
diff --git a/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/SYCLConversions.hpp
similarity index 79%
rename from packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/SYCLConversions.hpp
index 0ff3c7fee..7debf91d6 100644
--- a/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/SYCLConversions.hpp
@@ -80,16 +80,20 @@ struct DesulToSYCLMemoryScope<MemoryScopeSystem, extended_namespace> {
       sycl_memory_scope<extended_namespace>::system;
 };
 
-template <class T,
-          class MemoryOrder,
-          class MemoryScope,
-          sycl::access::address_space AddressSpace>
-using sycl_atomic_ref =
-    sycl::ext::oneapi::atomic_ref<T,
-                                  DesulToSYCLMemoryOrder<MemoryOrder>::value,
-                                  DesulToSYCLMemoryScope<MemoryScope>::value,
-                                  AddressSpace>;
-
+// FIXME_SYCL generic_space isn't available yet for CUDA.
+#ifdef __NVPTX__
+template <class T, class MemoryOrder, class MemoryScope>
+using sycl_atomic_ref = sycl::atomic_ref<T,
+                                         DesulToSYCLMemoryOrder<MemoryOrder>::value,
+                                         DesulToSYCLMemoryScope<MemoryScope>::value,
+                                         sycl::access::address_space::global_space>;
+#else
+template <class T, class MemoryOrder, class MemoryScope>
+using sycl_atomic_ref = sycl::atomic_ref<T,
+                                         DesulToSYCLMemoryOrder<MemoryOrder>::value,
+                                         DesulToSYCLMemoryScope<MemoryScope>::value,
+                                         sycl::access::address_space::generic_space>;
+#endif
 }  // namespace Impl
 }  // namespace desul
 
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm.hpp
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm_exchange.hpp
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/CUDA_asm_exchange.hpp
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm.inc
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc b/packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
diff --git a/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/openmp/OpenMP_40.hpp
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp
rename to packages/kokkos/tpls/desul/include/desul/atomics/openmp/OpenMP_40.hpp
diff --git a/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc b/packages/kokkos/tpls/desul/include/desul/atomics/openmp/OpenMP_40_op.inc
similarity index 100%
rename from packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc
rename to packages/kokkos/tpls/desul/include/desul/atomics/openmp/OpenMP_40_op.inc
diff --git a/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
similarity index 79%
rename from packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
rename to packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
index 8913f8bc7..19944b378 100644
--- a/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
+++ b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2019, Lawrence Livermore National Security, LLC
 and DESUL project contributors. See the COPYRIGHT file for details.
 Source: https://github.com/desul/desul
@@ -6,10 +6,10 @@ Source: https://github.com/desul/desul
 SPDX-License-Identifier: (BSD-3-Clause)
 */
 
-#include <desul/atomics/Lock_Array.hpp>
 #include <cinttypes>
-#include <string>
+#include <desul/atomics/Lock_Array.hpp>
 #include <sstream>
+#include <string>
 
 #ifdef DESUL_HAVE_CUDA_ATOMICS
 #ifdef __CUDACC_RDC__
@@ -17,7 +17,7 @@ namespace desul {
 namespace Impl {
 __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
 __device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE = nullptr;
-}
+}  // namespace Impl
 }  // namespace desul
 #endif
 
@@ -37,7 +37,6 @@ __global__ void init_lock_arrays_cuda_kernel() {
 
 namespace Impl {
 
-
 int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
 int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
 
@@ -46,37 +45,39 @@ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
 namespace {
 
 void check_error_and_throw_cuda(cudaError e, const std::string msg) {
-  if(e != cudaSuccess) {
+  if (e != cudaSuccess) {
     std::ostringstream out;
     out << "Desul::Error: " << msg << " error(" << cudaGetErrorName(e)
-                  << "): " << cudaGetErrorString(e);
+        << "): " << cudaGetErrorString(e);
     throw std::runtime_error(out.str());
   }
 }
 
-}
+}  // namespace
 
 // define functions
-template<typename T>
+template <typename T>
 void init_lock_arrays_cuda() {
   if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
   auto error_malloc1 = cudaMalloc(&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
-                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
-  check_error_and_throw_cuda(error_malloc1, "init_lock_arrays_cuda: cudaMalloc device locks");
+                                  sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc1,
+                             "init_lock_arrays_cuda: cudaMalloc device locks");
 
   auto error_malloc2 = cudaMallocHost(&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
-                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
-  check_error_and_throw_cuda(error_malloc2, "init_lock_arrays_cuda: cudaMalloc host locks");
+                                      sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc2,
+                             "init_lock_arrays_cuda: cudaMalloc host locks");
 
   auto error_sync1 = cudaDeviceSynchronize();
-  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  copy_cuda_lock_arrays_to_device();
   check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
   init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
   auto error_sync2 = cudaDeviceSynchronize();
   check_error_and_throw_cuda(error_sync2, "init_lock_arrays_cuda: post init kernel");
 }
 
-template<typename T>
+template <typename T>
 void finalize_lock_arrays_cuda() {
   if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
   cudaFree(CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h);
@@ -84,7 +85,7 @@ void finalize_lock_arrays_cuda() {
   CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
   CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
 #ifdef __CUDACC_RDC__
-  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  copy_cuda_lock_arrays_to_device();
 #endif
 }
 
diff --git a/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_HIP.cpp
similarity index 83%
rename from packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
rename to packages/kokkos/tpls/desul/src/Lock_Array_HIP.cpp
index 40030df64..5ccc6f7d5 100644
--- a/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
+++ b/packages/kokkos/tpls/desul/src/Lock_Array_HIP.cpp
@@ -8,8 +8,8 @@ SPDX-License-Identifier: (BSD-3-Clause)
 
 #include <cinttypes>
 #include <desul/atomics/Lock_Array.hpp>
-#include <string>
 #include <sstream>
+#include <string>
 
 #ifdef DESUL_HAVE_HIP_ATOMICS
 #ifdef DESUL_HIP_RDC
@@ -45,27 +45,29 @@ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
 namespace {
 
 void check_error_and_throw_hip(hipError_t e, const std::string msg) {
-  if(e != hipSuccess) {
+  if (e != hipSuccess) {
     std::ostringstream out;
     out << "Desul::Error: " << msg << " error(" << hipGetErrorName(e)
-                  << "): " << hipGetErrorString(e);
+        << "): " << hipGetErrorString(e);
     throw std::runtime_error(out.str());
   }
 }
 
-}
+}  // namespace
 
-template<typename T>
+template <typename T>
 void init_lock_arrays_hip() {
   if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
 
   auto error_malloc1 = hipMalloc(&HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,
-            sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
-  check_error_and_throw_hip(error_malloc1, "init_lock_arrays_hip: hipMalloc device locks");
+                                 sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc1,
+                            "init_lock_arrays_hip: hipMalloc device locks");
 
   auto error_malloc2 = hipHostMalloc(&HIP_SPACE_ATOMIC_LOCKS_NODE_h,
-                sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
-  check_error_and_throw_hip(error_malloc2, "init_lock_arrays_hip: hipMallocHost host locks");
+                                     sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc2,
+                            "init_lock_arrays_hip: hipMallocHost host locks");
 
   auto error_sync1 = hipDeviceSynchronize();
   DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
@@ -77,7 +79,7 @@ void init_lock_arrays_hip() {
   check_error_and_throw_hip(error_sync2, "init_lock_arrays_hip: post init");
 }
 
-template<typename T>
+template <typename T>
 void finalize_lock_arrays_hip() {
   if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
   auto error_free1 = hipFree(HIP_SPACE_ATOMIC_LOCKS_DEVICE_h);
@@ -98,4 +100,3 @@ template void finalize_lock_arrays_hip<int>();
 
 }  // namespace desul
 #endif
-
diff --git a/packages/kokkos/tpls/gtest/gtest/gtest-all.cc b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
index f4daf6b01..9f340e014 100644
--- a/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
+++ b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
@@ -4278,37 +4278,6 @@ void ReportInvalidTestSuiteType(const char* test_suite_name,
 }
 }  // namespace internal
 
-namespace {
-
-// A predicate that checks the test name of a TestInfo against a known
-// value.
-//
-// This is used for implementation of the TestSuite class only.  We put
-// it in the anonymous namespace to prevent polluting the outer
-// namespace.
-//
-// TestNameIs is copyable.
-class TestNameIs {
- public:
-  // Constructor.
-  //
-  // TestNameIs has NO default constructor.
-  explicit TestNameIs(const char* name)
-      : name_(name) {}
-#if defined(__EDG__)
-#pragma diag_suppress declared_but_not_referenced
-#endif
-  // Returns true if and only if the test name of test_info matches name_.
-  bool operator()(const TestInfo * test_info) const {
-    return test_info && test_info->name() == name_;
-  }
-
- private:
-  std::string name_;
-};
-
-}  // namespace
-
 namespace internal {
 
 // This method expands all parameterized tests registered with macros TEST_P
diff --git a/src/utils/Array.hpp b/src/utils/Array.hpp
index d641dbdef..7535b6735 100644
--- a/src/utils/Array.hpp
+++ b/src/utils/Array.hpp
@@ -8,7 +8,6 @@
 #include <utils/PugsUtils.hpp>
 #include <utils/Types.hpp>
 
-#include <Kokkos_CopyViews.hpp>
 #include <algorithm>
 
 template <typename DataType>
@@ -25,18 +24,21 @@ class [[nodiscard]] Array
     const size_t m_size;
 
    public:
-    [[nodiscard]] PUGS_INLINE size_t size() const
+    [[nodiscard]] PUGS_INLINE size_t
+    size() const
     {
       return m_size;
     }
 
-    [[nodiscard]] PUGS_INLINE DataType& operator[](size_t i) const
+    [[nodiscard]] PUGS_INLINE DataType&
+    operator[](size_t i) const
     {
       Assert(i < m_size, "invalid index");
       return m_values[i];
     }
 
-    PUGS_INLINE void fill(const DataType& data) const
+    PUGS_INLINE void
+    fill(const DataType& data) const
     {
       for (size_t i = 0; i < m_size; ++i) {
         m_values[i] = data;
@@ -44,7 +46,7 @@ class [[nodiscard]] Array
     }
 
     UnsafeArrayView& operator=(const UnsafeArrayView&) = delete;
-    UnsafeArrayView& operator=(UnsafeArrayView&&) = delete;
+    UnsafeArrayView& operator=(UnsafeArrayView&&)      = delete;
 
     UnsafeArrayView(const Array<DataType>& array, index_type begin, index_type size)
       : m_values{&array[begin]}, m_size{size}
@@ -68,12 +70,14 @@ class [[nodiscard]] Array
   friend Array<std::add_const_t<DataType>>;
 
  public:
-  [[nodiscard]] PUGS_INLINE size_t size() const noexcept
+  [[nodiscard]] PUGS_INLINE size_t
+  size() const noexcept
   {
     return m_values.extent(0);
   }
 
-  [[nodiscard]] friend PUGS_INLINE Array<std::remove_const_t<DataType>> copy(const Array<DataType>& source)
+  [[nodiscard]] friend PUGS_INLINE Array<std::remove_const_t<DataType>>
+  copy(const Array<DataType>& source)
   {
     Array<std::remove_const_t<DataType>> image(source.size());
     Kokkos::deep_copy(image.m_values, source.m_values);
@@ -81,8 +85,8 @@ class [[nodiscard]] Array
     return image;
   }
 
-  friend PUGS_INLINE void copy_to(const Array<DataType>& source,
-                                  const Array<std::remove_const_t<DataType>>& destination)
+  friend PUGS_INLINE void
+  copy_to(const Array<DataType>& source, const Array<std::remove_const_t<DataType>>& destination)
   {
     Assert(source.size() == destination.size(), "incompatible Array sizes");
     Kokkos::deep_copy(destination.m_values, source.m_values);
@@ -96,14 +100,16 @@ class [[nodiscard]] Array
                                                                       typename Array<DataType2>::index_type begin,
                                                                       typename Array<DataType2>::index_type size);
 
-  [[nodiscard]] PUGS_INLINE DataType& operator[](index_type i) const noexcept(NO_ASSERT)
+  [[nodiscard]] PUGS_INLINE DataType&
+  operator[](index_type i) const noexcept(NO_ASSERT)
   {
     Assert(i < m_values.extent(0), "invalid index");
     return m_values[i];
   }
 
   PUGS_INLINE
-  void fill(const DataType& data) const
+  void
+  fill(const DataType& data) const
   {
     static_assert(not std::is_const_v<DataType>, "Cannot modify Array of const");
 
@@ -111,7 +117,8 @@ class [[nodiscard]] Array
   }
 
   template <typename DataType2>
-  PUGS_INLINE Array& operator=(const Array<DataType2>& array) noexcept
+  PUGS_INLINE Array&
+  operator=(const Array<DataType2>& array) noexcept
   {
     // ensures that DataType is the same as source DataType2
     static_assert(std::is_same<std::remove_const_t<DataType>, std::remove_const_t<DataType2>>(),
@@ -154,7 +161,8 @@ class [[nodiscard]] Array
 #endif   // NDEBUG
   }
 
-  friend std::ostream& operator<<(std::ostream& os, const Array& x)
+  friend std::ostream&
+  operator<<(std::ostream& os, const Array& x)
   {
     if (x.size() > 0) {
       os << 0 << ':' << NaNHelper(x[0]);
@@ -172,13 +180,14 @@ class [[nodiscard]] Array
   Array(const Array&) = default;
 
   template <typename DataType2>
-  PUGS_INLINE Array(const Array<DataType2>& array) noexcept
+  PUGS_INLINE
+  Array(const Array<DataType2>& array) noexcept
   {
     this->operator=(array);
   }
 
   PUGS_INLINE
-  Array(Array &&) = default;
+  Array(Array&&) = default;
 
   PUGS_INLINE
   ~Array() = default;
diff --git a/src/utils/PugsUtils.cpp b/src/utils/PugsUtils.cpp
index 705fbd5e0..76509c611 100644
--- a/src/utils/PugsUtils.cpp
+++ b/src/utils/PugsUtils.cpp
@@ -18,6 +18,7 @@
 #include <CLI/CLI.hpp>
 
 #include <iostream>
+#include <thread>
 
 std::string
 pugsVersion()
@@ -150,7 +151,8 @@ initialize(int& argc, char* argv[])
 #else    // PUGS_HAS_MPI
     std::cout << "Sequential build\n";
 #endif   // PUGS_HAS_MPI
-    Kokkos::DefaultExecutionSpace::print_configuration(std::cout);
+    std::cout << "Number of threads " << Kokkos::DefaultHostExecutionSpace::concurrency() << " / "
+              << std::max(std::thread::hardware_concurrency(), 1u) << '\n';
     std::cout << rang::style::reset;
     std::cout << "-------------------------------------------------------\n";
   }
diff --git a/src/utils/Table.hpp b/src/utils/Table.hpp
index 6171f1a24..6972ea103 100644
--- a/src/utils/Table.hpp
+++ b/src/utils/Table.hpp
@@ -7,8 +7,6 @@
 #include <utils/PugsMacros.hpp>
 #include <utils/PugsUtils.hpp>
 
-#include <Kokkos_CopyViews.hpp>
-
 #include <iostream>
 
 template <typename DataType>
@@ -35,19 +33,22 @@ class [[nodiscard]] Table
     const size_t m_row;
 
    public:
-    PUGS_INLINE size_t size() const noexcept
+    PUGS_INLINE size_t
+    size() const noexcept
     {
       return m_table.numberOfColumns();
     }
 
     PUGS_INLINE
-    DataType& operator[](size_t i) const
+    DataType&
+    operator[](size_t i) const
     {
       Assert(i < m_table.numberOfColumns(), "invalid index");
       return m_table(m_row, i);
     }
 
-    PUGS_INLINE void fill(const DataType& data) const
+    PUGS_INLINE void
+    fill(const DataType& data) const
     {
       for (size_t i = 0; i < this->size(); ++i) {
         m_table(m_row, i) = data;
@@ -55,7 +56,7 @@ class [[nodiscard]] Table
     }
 
     UnsafeRowView& operator=(const UnsafeRowView&) = delete;
-    UnsafeRowView& operator=(UnsafeRowView&&) = delete;
+    UnsafeRowView& operator=(UnsafeRowView&&)      = delete;
 
     UnsafeRowView(const Table<DataType>& table, index_type row) : m_table{table}, m_row{row}
     {
@@ -91,18 +92,21 @@ class [[nodiscard]] Table
       const size_t m_row;
 
      public:
-      [[nodiscard]] PUGS_INLINE size_t size() const
+      [[nodiscard]] PUGS_INLINE size_t
+      size() const
       {
         return m_table_view.numberOfColumns();
       }
 
-      [[nodiscard]] PUGS_INLINE DataType& operator[](size_t i) const
+      [[nodiscard]] PUGS_INLINE DataType&
+      operator[](size_t i) const
       {
         Assert(i < m_table_view.numberOfColumns(), "invalid index");
         return m_table_view(m_row, i);
       }
 
-      PUGS_INLINE void fill(const DataType& data) const
+      PUGS_INLINE void
+      fill(const DataType& data) const
       {
         for (size_t i = 0; i < this->size(); ++i) {
           m_table_view(m_row, i) = data;
@@ -123,23 +127,27 @@ class [[nodiscard]] Table
       ~RowView() = default;
     };
 
-    [[nodiscard]] PUGS_INLINE size_t numberOfRows() const noexcept
+    [[nodiscard]] PUGS_INLINE size_t
+    numberOfRows() const noexcept
     {
       return m_row_size;
     }
 
-    [[nodiscard]] PUGS_INLINE size_t numberOfColumns() const noexcept
+    [[nodiscard]] PUGS_INLINE size_t
+    numberOfColumns() const noexcept
     {
       return m_column_size;
     }
 
-    [[nodiscard]] PUGS_INLINE RowView operator[](size_t i) const
+    [[nodiscard]] PUGS_INLINE RowView
+    operator[](size_t i) const
     {
       Assert(i < this->numberOfRows(), "invalid index");
       return RowView(*this, i);
     }
 
-    [[nodiscard]] PUGS_INLINE DataType& operator()(size_t i, size_t j) const
+    [[nodiscard]] PUGS_INLINE DataType&
+    operator()(size_t i, size_t j) const
     {
       Assert(i < m_row_size, "invalid row index");
       Assert(j < m_column_size, "invalid column index");
@@ -147,7 +155,8 @@ class [[nodiscard]] Table
       return m_table(m_row_begin + i, m_column_begin + j);
     }
 
-    PUGS_INLINE void fill(const DataType& data) const
+    PUGS_INLINE void
+    fill(const DataType& data) const
     {
       for (size_t i = 0; i < m_row_size; ++i) {
         for (size_t j = 0; j < m_column_size; ++j) {
@@ -156,9 +165,12 @@ class [[nodiscard]] Table
       }
     }
     UnsafeTableView& operator=(const UnsafeTableView&) = delete;
-    UnsafeTableView& operator=(UnsafeTableView&&) = delete;
+    UnsafeTableView& operator=(UnsafeTableView&&)      = delete;
 
-    UnsafeTableView(const Table<DataType>& table, index_type row_begin, index_type row_size, index_type column_begin,
+    UnsafeTableView(const Table<DataType>& table,
+                    index_type row_begin,
+                    index_type row_size,
+                    index_type column_begin,
                     index_type column_size)
       : m_table{table},
         m_row_begin{row_begin},
@@ -181,22 +193,26 @@ class [[nodiscard]] Table
     ~UnsafeTableView() = default;
   };
 
-  [[nodiscard]] PUGS_INLINE size_t numberOfRows() const noexcept
+  [[nodiscard]] PUGS_INLINE size_t
+  numberOfRows() const noexcept
   {
     return m_values.extent(0);
   }
 
-  [[nodiscard]] PUGS_INLINE size_t numberOfColumns() const noexcept
+  [[nodiscard]] PUGS_INLINE size_t
+  numberOfColumns() const noexcept
   {
     return m_values.extent(1);
   }
 
-  [[nodiscard]] PUGS_INLINE Table<DataType>::UnsafeRowView operator[](index_type i) const
+  [[nodiscard]] PUGS_INLINE Table<DataType>::UnsafeRowView
+  operator[](index_type i) const
   {
     return UnsafeRowView(*this, i);
   }
 
-  [[nodiscard]] friend PUGS_INLINE Table<std::remove_const_t<DataType>> copy(const Table<DataType>& source)
+  [[nodiscard]] friend PUGS_INLINE Table<std::remove_const_t<DataType>>
+  copy(const Table<DataType>& source)
   {
     Table<std::remove_const_t<DataType>> image(source.numberOfRows(), source.numberOfColumns());
     Kokkos::deep_copy(image.m_values, source.m_values);
@@ -204,8 +220,8 @@ class [[nodiscard]] Table
     return image;
   }
 
-  friend PUGS_INLINE void copy_to(const Table<DataType>& source,
-                                  const Table<std::remove_const_t<DataType>>& destination)
+  friend PUGS_INLINE void
+  copy_to(const Table<DataType>& source, const Table<std::remove_const_t<DataType>>& destination)
   {
     Assert(source.numberOfRows() == destination.numberOfRows(), "incompatible number of rows");
     Assert(source.numberOfColumns() == destination.numberOfColumns(), "incompatible number of columns");
@@ -216,12 +232,15 @@ class [[nodiscard]] Table
   friend PUGS_INLINE Table<DataType2> encapsulate(const Kokkos::View<DataType2**, RT...>& values);
 
   template <typename DataType2>
-  friend PUGS_INLINE typename Table<DataType2>::UnsafeTableView
-  subTableView(const Table<DataType2>& table, typename Table<DataType2>::index_type row_begin,
-               typename Table<DataType2>::index_type row_size, typename Table<DataType2>::index_type column_begin,
-               typename Table<DataType2>::index_type column_size);
-
-  [[nodiscard]] PUGS_INLINE DataType& operator()(index_type i, index_type j) const noexcept(NO_ASSERT)
+  friend PUGS_INLINE typename Table<DataType2>::UnsafeTableView subTableView(
+    const Table<DataType2>& table,
+    typename Table<DataType2>::index_type row_begin,
+    typename Table<DataType2>::index_type row_size,
+    typename Table<DataType2>::index_type column_begin,
+    typename Table<DataType2>::index_type column_size);
+
+  [[nodiscard]] PUGS_INLINE DataType&
+  operator()(index_type i, index_type j) const noexcept(NO_ASSERT)
   {
     Assert(i < this->numberOfRows(), "invalid row index");
     Assert(j < this->numberOfColumns(), "invalid column index");
@@ -229,7 +248,8 @@ class [[nodiscard]] Table
   }
 
   PUGS_INLINE
-  void fill(const DataType& data) const
+  void
+  fill(const DataType& data) const
   {
     static_assert(not std::is_const<DataType>(), "Cannot modify Table of const");
 
@@ -237,7 +257,8 @@ class [[nodiscard]] Table
   }
 
   template <typename DataType2>
-  PUGS_INLINE Table& operator=(const Table<DataType2>& table) noexcept
+  PUGS_INLINE Table&
+  operator=(const Table<DataType2>& table) noexcept
   {
     // ensures that DataType is the same as source DataType2
     static_assert(std::is_same<std::remove_const_t<DataType>, std::remove_const_t<DataType2>>(),
@@ -281,7 +302,8 @@ class [[nodiscard]] Table
 #endif   // NDEBUG
   }
 
-  friend std::ostream& operator<<(std::ostream& os, const Table& t)
+  friend std::ostream&
+  operator<<(std::ostream& os, const Table& t)
   {
     for (size_t i = 0; i < t.numberOfRows(); ++i) {
       os << i << '|';
@@ -300,13 +322,14 @@ class [[nodiscard]] Table
   Table(const Table&) = default;
 
   template <typename DataType2>
-  PUGS_INLINE Table(const Table<DataType2>& table) noexcept
+  PUGS_INLINE
+  Table(const Table<DataType2>& table) noexcept
   {
     this->operator=(table);
   }
 
   PUGS_INLINE
-  Table(Table &&) = default;
+  Table(Table&&) = default;
 
   PUGS_INLINE
   ~Table() = default;
diff --git a/tests/mpi_test_main.cpp b/tests/mpi_test_main.cpp
index f2a2fe63a..eeaf00129 100644
--- a/tests/mpi_test_main.cpp
+++ b/tests/mpi_test_main.cpp
@@ -28,7 +28,15 @@ main(int argc, char* argv[])
   const int nb_max_threads = std::max(std::thread::hardware_concurrency(), 1u);
   const int nb_threads     = std::max(nb_max_threads / parallel::Messenger::getInstance().size(), 1ul);
 
-  Kokkos::initialize({nb_threads, -1, -1, true});
+  {
+    Kokkos::InitArguments args;
+    args.num_threads      = nb_threads;
+    args.num_numa         = -1;
+    args.device_id        = -1;
+    args.disable_warnings = true;
+
+    Kokkos::initialize(args);
+  }
 
   PETScWrapper::initialize(argc, argv);
 
diff --git a/tests/test_main.cpp b/tests/test_main.cpp
index 3eff2f311..0dcf46cfe 100644
--- a/tests/test_main.cpp
+++ b/tests/test_main.cpp
@@ -23,7 +23,15 @@ main(int argc, char* argv[])
   parallel::Messenger::create(argc, argv);
   const int nb_threads = std::max(std::thread::hardware_concurrency(), 1u);
 
-  Kokkos::initialize({nb_threads, -1, -1, true});
+  {
+    Kokkos::InitArguments args;
+    args.num_threads      = nb_threads;
+    args.num_numa         = -1;
+    args.device_id        = -1;
+    args.disable_warnings = true;
+
+    Kokkos::initialize(args);
+  }
 
   PETScWrapper::initialize(argc, argv);
   SLEPcWrapper::initialize(argc, argv);
-- 
GitLab