diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
index b2b4bfc3109d9aa10966b2866c472fa4d8457c5f..4ae42bc93ff649401818962ee7e4f16c2c468cb2 100644
--- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
@@ -10,7 +10,7 @@ jobs:
     continue-on-error: true
     strategy:
       matrix:
-        distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest']
+        distro: ['fedora:latest', 'ubuntu:latest']
         cxx: ['g++', 'clang++']
         cmake_build_type: ['Release', 'Debug']
         backend: ['OPENMP']
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
index 9b53d527695cc0db4672b75762f449344c305baa..4c3eed6dba9d2682fd85d2185169ac1f5c8e17a6 100644
--- a/packages/kokkos/.gitrepo
+++ b/packages/kokkos/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:kokkos/kokkos.git
 	branch = master
-	commit = 61d7db55fceac3318c987a291f77b844fd94c165
-	parent = 91d53e3cfb9a55832aae102ca677044a47f2515d
+	commit = 5ad609661e570ba6aa7716a26a91cb67d559f8a2
+	parent = db24c19be339723f3a10fffce3075fd72dccdfeb
 	method = merge
-	cmdver = 0.4.3
+	cmdver = 0.4.5
diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins
index 09052840e6a7ea4115c1be9abb3f2aa2e644eabd..3025cb558eab0c10d6be328b199515807fbba03b 100644
--- a/packages/kokkos/.jenkins
+++ b/packages/kokkos/.jenkins
@@ -347,7 +347,7 @@ pipeline {
                         dockerfile {
                             filename 'Dockerfile.nvcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3'
+                            additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0.3-devel-ubuntu18.04 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3'
                             label 'nvidia-docker'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
                         }
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index e81f2944519e1b39b31e1c9d7332b3aa6cb8d45e..bdbc75604bab5fbbfd436767531ab30371cc788b 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,5 +1,27 @@
 # Change Log
 
+## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
+
+### Bug Fixes:
+- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
+- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
+- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
+- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
+- Fix warning calling a `__host__` function from a `__host__ __device__` from `View:: as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
+- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557)
+- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
+- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
+- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
+- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
+- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
+- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
+- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
+- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
+- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
+- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
+- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)
+
 ## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
 
@@ -102,7 +124,6 @@
 - Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
 - Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
 - Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
-- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
 - Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
 - Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
 - Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index a05bfcdb94d53e0a7d453d62909e9a5686f6cc41..7b78f29d7340499aff394302911e59c5ef120d52 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -129,7 +129,7 @@ ENDIF()
 
 set(Kokkos_VERSION_MAJOR 3)
 set(Kokkos_VERSION_MINOR 7)
-set(Kokkos_VERSION_PATCH 00)
+set(Kokkos_VERSION_PATCH 01)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
@@ -152,6 +152,7 @@ ENDIF()
 # but scoping issues can make it difficult
 GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
 GLOBAL_SET(KOKKOS_LINK_OPTIONS)
+GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
 GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
 GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
 GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
@@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS)
   # we have to match the annoying behavior, also we have to preserve quotes
   # which needs another workaround.
   SET(KOKKOS_COMPILE_OPTIONS_TMP)
+  IF (KOKKOS_ENABLE_HIP)
+    LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
+  ENDIF()
   FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
     STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
     IF(OPTION_HAS_WHITESPACE EQUAL -1)
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index d493abbf1421973a973e93775d90ef83e502e2cd..2e32c9d53893bf552381eb618db37acaf8156822 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -2,7 +2,7 @@
 
 KOKKOS_VERSION_MAJOR = 3
 KOKKOS_VERSION_MINOR = 7
-KOKKOS_VERSION_PATCH = 00
+KOKKOS_VERSION_PATCH = 01
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@@ -495,10 +495,6 @@ KOKKOS_LINK_FLAGS =
 KOKKOS_SRC =
 KOKKOS_HEADERS =
 
-#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
-  KOKKOS_LIBS += -latomic
-#endif
-
 # Generating the KokkosCore_config.h file.
 
 KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@@ -540,6 +536,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_LIBS += -latomic
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
   ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
diff --git a/packages/kokkos/algorithms/cmake/Dependencies.cmake b/packages/kokkos/algorithms/cmake/Dependencies.cmake
index 1b413106817cc6adf18dc94189203a27e641c6d5..c36b62523fadb628e970b6eccf57a9caaa317f1e 100644
--- a/packages/kokkos/algorithms/cmake/Dependencies.cmake
+++ b/packages/kokkos/algorithms/cmake/Dependencies.cmake
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
   )
diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
index ad0c2d47b6d20d2022b5c60e81e7268b53d47f13..c7be70e09a48eff3fb2bf3a7cb89be1cf1fe6664 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -265,8 +265,8 @@ class BinSort {
   //----------------------------------------
   // Create the permutation vector, the bin_offset array and the bin_count
   // array. Can be called again if keys changed
-  template <class ExecutionSpace = exec_space>
-  void create_permute_vector(const ExecutionSpace& exec = exec_space{}) {
+  template <class ExecutionSpace>
+  void create_permute_vector(const ExecutionSpace& exec) {
     static_assert(
         Kokkos::SpaceAccessibility<ExecutionSpace,
                                    typename Space::memory_space>::accessible,
@@ -297,6 +297,15 @@ class BinSort {
           *this);
   }
 
+  // Create the permutation vector, the bin_offset array and the bin_count
+  // array. Can be called again if keys changed
+  void create_permute_vector() {
+    Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
+    exec_space e{};
+    create_permute_vector(e);
+    e.fence("Kokkos::Binsort::create_permute_vector: after");
+  }
+
   // Sort a subset of a view with respect to the first dimension using the
   // permutation array
   template <class ExecutionSpace, class ValuesViewType>
@@ -372,9 +381,10 @@ class BinSort {
   template <class ValuesViewType>
   void sort(ValuesViewType const& values, int values_range_begin,
             int values_range_end) const {
+    Kokkos::fence("Kokkos::Binsort::sort: before");
     exec_space exec;
     sort(exec, values, values_range_begin, values_range_end);
-    exec.fence("Kokkos::Sort: fence after sorting");
+    exec.fence("Kokkos::BinSort:sort: after");
   }
 
   template <class ExecutionSpace, class ValuesViewType>
@@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
 
 template <class ViewType>
 void sort(ViewType const& view) {
+  Kokkos::fence("Kokkos::sort: before");
   typename ViewType::execution_space exec;
   sort(exec, view);
-  exec.fence("Kokkos::Sort: fence after sorting");
+  exec.fence("Kokkos::sort: fence after sorting");
 }
 
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
 
 template <class ViewType>
 void sort(ViewType view, size_t const begin, size_t const end) {
+  Kokkos::fence("Kokkos::sort: before");
   typename ViewType::execution_space exec;
   sort(exec, view, begin, end);
   exec.fence("Kokkos::Sort: fence after sorting");
diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in
index 34807ac2b26228a4f0c10aa3ee5c4f7951ac235f..88ddc483786112ca0e70f418921d89ae102b9dde 100644
--- a/packages/kokkos/cmake/KokkosCore_config.h.in
+++ b/packages/kokkos/cmake/KokkosCore_config.h.in
@@ -66,6 +66,7 @@
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
 #cmakedefine KOKKOS_ARCH_ARMV81
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
+#cmakedefine KOKKOS_ARCH_A64FX
 #cmakedefine KOKKOS_ARCH_AMD_AVX2
 #cmakedefine KOKKOS_ARCH_AVX
 #cmakedefine KOKKOS_ARCH_AVX2
diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake
index d4c2cda651f3510bd66e9b8faff344ebf0cf666a..ef16aad047a96cfb31f3ae6c5ecaa93ff8175539 100644
--- a/packages/kokkos/cmake/kokkos_arch.cmake
+++ b/packages/kokkos/cmake/kokkos_arch.cmake
@@ -187,7 +187,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
 ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   SET(CUDA_ARCH_FLAG "-gpu")
   GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
-  GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
+  IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled
+    GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
+  ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
   SET(CUDA_ARCH_FLAG "-arch")
 ENDIF()
diff --git a/packages/kokkos/containers/cmake/Dependencies.cmake b/packages/kokkos/containers/cmake/Dependencies.cmake
index 5e29157369c9ab8cab935a1bfc4c6dad2fdd0296..1d71d8af341181f689a6a8bf63036b67584cb138 100644
--- a/packages/kokkos/containers/cmake/Dependencies.cmake
+++ b/packages/kokkos/containers/cmake/Dependencies.cmake
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
   )
diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
index 442f0d8617524dc0c1459bf10110891e97b3a6b2..059ce8a610d26c9072b1cd15a282364855130d9a 100644
--- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -1701,7 +1701,11 @@ namespace Impl {
    underlying memory, to facilitate implementation of deep_copy() and
    other routines that are defined on View */
 template <unsigned N, typename T, typename... Args>
-KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) {
+KOKKOS_FUNCTION auto as_view_of_rank_n(
+    DynRankView<T, Args...> v,
+    typename std::enable_if<std::is_same<
+        typename ViewTraits<T, Args...>::specialize, void>::value>::type* =
+        nullptr) {
   if (v.rank() != N) {
     KOKKOS_IF_ON_HOST(
         const std::string message =
@@ -2114,9 +2118,10 @@ inline auto create_mirror(
 namespace Impl {
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    std::is_same<
-        typename DynRankView<T, P...>::memory_space,
-        typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        std::is_same<
+            typename DynRankView<T, P...>::memory_space,
+            typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
         std::is_same<
             typename DynRankView<T, P...>::data_type,
             typename DynRankView<T, P...>::HostMirror::data_type>::value,
@@ -2128,12 +2133,13 @@ create_mirror_view(const DynRankView<T, P...>& src,
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    !(std::is_same<
-          typename DynRankView<T, P...>::memory_space,
-          typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
-      std::is_same<
-          typename DynRankView<T, P...>::data_type,
-          typename DynRankView<T, P...>::HostMirror::data_type>::value),
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        !(std::is_same<
+              typename DynRankView<T, P...>::memory_space,
+              typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
+          std::is_same<
+              typename DynRankView<T, P...>::data_type,
+              typename DynRankView<T, P...>::HostMirror::data_type>::value),
     typename DynRankView<T, P...>::HostMirror>
 create_mirror_view(
     const DynRankView<T, P...>& src,
@@ -2141,29 +2147,39 @@ create_mirror_view(
   return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
 inline std::enable_if_t<
-    Kokkos::is_space<Space>::value &&
-        Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
-    typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src,
+    Kokkos::is_space<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
+        Impl::MirrorDRViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace,
+    typename Impl::MirrorDRViewType<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+        P...>::view_type>
+create_mirror_view(const Kokkos::DynRankView<T, P...>& src,
                    const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
 inline std::enable_if_t<
-    Kokkos::is_space<Space>::value &&
-        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
-    typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
+    Kokkos::is_space<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
+        !Impl::MirrorDRViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace,
+    typename Impl::MirrorDRViewType<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+        P...>::view_type>
 create_mirror_view(
-    const Space&, const Kokkos::DynRankView<T, P...>& src,
+    const Kokkos::DynRankView<T, P...>& src,
     const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  using MemorySpace = typename Space::memory_space;
-  using alloc_prop  = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
-  alloc_prop prop_copy(arg_prop);
-
-  return Kokkos::Impl::create_mirror(src, prop_copy);
+  return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 }  // namespace Impl
 
@@ -2224,9 +2240,10 @@ create_mirror_view(
 
 template <class Space, class T, class... P>
 inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
-                               const Space& space,
+                               const Space&,
                                const Kokkos::DynRankView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
+  return Impl::create_mirror_view(
+      src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
 }
 
 template <class T, class... P, class... ViewCtorArgs>
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
index 015a75cb0b02c602db2a3bded219497c3414595c..a2b68064de13bd2b8988b6a2025bc3c9ef2c2685 100644
--- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -710,7 +710,7 @@ template <class Space, class T, class... P>
 inline auto create_mirror(
     const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
   return Impl::create_mirror(
-      src, Impl::ViewCtorProp<>{typename Space::memory_space{}});
+      src, Kokkos::view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
@@ -729,48 +729,68 @@ inline auto create_mirror(
 }
 
 namespace Impl {
+
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    (std::is_same<
-         typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
-         typename Kokkos::Experimental::DynamicView<
-             T, P...>::HostMirror::memory_space>::value &&
-     std::is_same<
-         typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
-         typename Kokkos::Experimental::DynamicView<
-             T, P...>::HostMirror::data_type>::value),
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        (std::is_same<
+             typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
+             typename Kokkos::Experimental::DynamicView<
+                 T, P...>::HostMirror::memory_space>::value &&
+         std::is_same<
+             typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
+             typename Kokkos::Experimental::DynamicView<
+                 T, P...>::HostMirror::data_type>::value),
     typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
-create_mirror_view(
-    const typename Kokkos::Experimental::DynamicView<T, P...>& src,
-    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
+create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    !(std::is_same<
-          typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
-          typename Kokkos::Experimental::DynamicView<
-              T, P...>::HostMirror::memory_space>::value &&
-      std::is_same<
-          typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
-          typename Kokkos::Experimental::DynamicView<
-              T, P...>::HostMirror::data_type>::value),
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        !(std::is_same<
+              typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
+              typename Kokkos::Experimental::DynamicView<
+                  T, P...>::HostMirror::memory_space>::value &&
+          std::is_same<
+              typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
+              typename Kokkos::Experimental::DynamicView<
+                  T, P...>::HostMirror::data_type>::value),
     typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
 create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
   return Kokkos::create_mirror(arg_prop, src);
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline std::enable_if_t<
-    Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace,
-    typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&,
-                   const Kokkos::Experimental::DynamicView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
+
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<!Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return Kokkos::Impl::create_mirror(src, arg_prop);
+}
 }  // namespace Impl
 
 // Create a mirror view in host space
@@ -790,8 +810,9 @@ inline auto create_mirror_view(
 // Create a mirror in a new space
 template <class Space, class T, class... P>
 inline auto create_mirror_view(
-    const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
+    const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror_view(src,
+                                  view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
index 0b54d1bdd952f33e433f17b05c56ef415ee286b4..5027763a0297a00c2b9dfb28734da628e763d7dc 100644
--- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -1901,19 +1901,22 @@ struct MirrorOffsetType {
 
 namespace Impl {
 template <class T, class... P, class... ViewCtorArgs>
-inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
+inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
+    typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
 create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
               const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
   return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
       Kokkos::create_mirror(arg_prop, src.view()), src.begins());
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
-create_mirror(const Space&,
-              const Kokkos::Experimental::OffsetView<T, P...>& src,
-              const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
+                          const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
   using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space;
 
   static_assert(
       !alloc_prop_input::has_label,
@@ -1923,10 +1926,6 @@ create_mirror(const Space&,
       !alloc_prop_input::has_pointer,
       "The view constructor arguments passed to Kokkos::create_mirror must "
       "not include a pointer!");
-  static_assert(
-      !alloc_prop_input::has_memory_space,
-      "The view constructor arguments passed to Kokkos::create_mirror must "
-      "not include a memory space instance!");
   static_assert(
       !alloc_prop_input::allow_padding,
       "The view constructor arguments passed to Kokkos::create_mirror must "
@@ -1962,15 +1961,17 @@ inline auto create_mirror(
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 inline auto create_mirror(
-    const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{});
+    const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror(
+      src, Kokkos::view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
 typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
-create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
+create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
               const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror(space, src, Kokkos::view_alloc(wi));
+  return Impl::create_mirror(
+      src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
 }
 
 template <class T, class... P, class... ViewCtorArgs>
@@ -1983,54 +1984,64 @@ inline auto create_mirror(
 namespace Impl {
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    (std::is_same<
-         typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
-         typename Kokkos::Experimental::OffsetView<
-             T, P...>::HostMirror::memory_space>::value &&
-     std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
-                  typename Kokkos::Experimental::OffsetView<
-                      T, P...>::HostMirror::data_type>::value),
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        (std::is_same<
+             typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
+             typename Kokkos::Experimental::OffsetView<
+                 T, P...>::HostMirror::memory_space>::value &&
+         std::is_same<
+             typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
+             typename Kokkos::Experimental::OffsetView<
+                 T, P...>::HostMirror::data_type>::value),
     typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
-create_mirror_view(
-    const typename Kokkos::Experimental::OffsetView<T, P...>& src,
-    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
+create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    !(std::is_same<
-          typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
-          typename Kokkos::Experimental::OffsetView<
-              T, P...>::HostMirror::memory_space>::value &&
-      std::is_same<
-          typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
-          typename Kokkos::Experimental::OffsetView<
-              T, P...>::HostMirror::data_type>::value),
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        !(std::is_same<
+              typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
+              typename Kokkos::Experimental::OffsetView<
+                  T, P...>::HostMirror::memory_space>::value &&
+          std::is_same<
+              typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
+              typename Kokkos::Experimental::OffsetView<
+                  T, P...>::HostMirror::data_type>::value),
     typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
 create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
   return Kokkos::create_mirror(arg_prop, src);
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline std::enable_if_t<
-    Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
-    Kokkos::Experimental::OffsetView<T, P...>>
-create_mirror_view(const Space&,
-                   const Kokkos::Experimental::OffsetView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-std::enable_if_t<
-    !Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
-    typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space& space,
-                   const Kokkos::Experimental::OffsetView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<!Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  return create_mirror(space, src, arg_prop);
+  return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 }  // namespace Impl
 
@@ -2052,15 +2063,17 @@ inline auto create_mirror_view(
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 inline auto create_mirror_view(
-    const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
+    const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror_view(
+      src, Kokkos::view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
 inline auto create_mirror_view(
-    Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
+    Kokkos::Impl::WithoutInitializing_t wi, const Space&,
     const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
+  return Impl::create_mirror_view(
+      src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
 }
 
 template <class T, class... P, class... ViewCtorArgs>
diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt
index f16572b60300562eabd01563ee2469cfa899bf65..261d9dcd4215d712ef7b6fca3b0ad08c9ecb0052 100644
--- a/packages/kokkos/containers/unit_tests/CMakeLists.txt
+++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt
@@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
     KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
   endif()
 endforeach()
+
+SET(COMPILE_ONLY_SOURCES
+  TestCreateMirror.cpp
+)
+KOKKOS_ADD_EXECUTABLE(
+  TestCompileOnly
+  SOURCES
+  TestCompileMain.cpp
+  ${COMPILE_ONLY_SOURCES}
+)
diff --git a/packages/kokkos/containers/unit_tests/TestCompileMain.cpp b/packages/kokkos/containers/unit_tests/TestCompileMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..237c8ce181774d991a9dbdd8cacf1a5fb9f199f1
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestCompileMain.cpp
@@ -0,0 +1 @@
+int main() {}
diff --git a/packages/kokkos/containers/unit_tests/TestCreateMirror.cpp b/packages/kokkos/containers/unit_tests/TestCreateMirror.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0e43be4364154393b30cd349a563d7984a5ca2f0
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestCreateMirror.cpp
@@ -0,0 +1,179 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynamicView.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <Kokkos_OffsetView.hpp>
+
+template <typename TestView, typename MemorySpace>
+void check_memory_space(TestView, MemorySpace) {
+  static_assert(
+      std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
+}
+
+template <class View>
+auto host_mirror_test_space(View) {
+  return std::conditional_t<
+      Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename View::memory_space>::accessible,
+      typename View::memory_space, Kokkos::HostSpace>{};
+}
+
+template <typename View>
+void test_create_mirror_properties(const View& view) {
+  using namespace Kokkos;
+  using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
+
+  // clang-format off
+  
+  // create_mirror
+#ifndef KOKKOS_ENABLE_CXX14
+  // FIXME DynamicView: HostMirror is the same type
+  if constexpr (!is_dynamic_view<View>::value) {
+    check_memory_space(create_mirror(WithoutInitializing,                        view), host_mirror_test_space(view));
+    check_memory_space(create_mirror(                                            view), host_mirror_test_space(view));
+  }
+#endif
+  check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+  check_memory_space(create_mirror(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+
+  // create_mirror_view
+#ifndef KOKKOS_ENABLE_CXX14
+  // FIXME DynamicView: HostMirror is the same type
+  if constexpr (!is_dynamic_view<View>::value) {
+    check_memory_space(create_mirror_view(WithoutInitializing,                        view), host_mirror_test_space(view));
+    check_memory_space(create_mirror_view(                                            view), host_mirror_test_space(view));
+  }
+#endif
+  check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+  check_memory_space(create_mirror_view(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+
+  // create_mirror view_alloc
+#ifndef KOKKOS_ENABLE_CXX14
+  // FIXME DynamicView: HostMirror is the same type
+  if constexpr (!is_dynamic_view<View>::value) {
+    check_memory_space(create_mirror(view_alloc(WithoutInitializing),                    view), host_mirror_test_space(view));
+    check_memory_space(create_mirror(view_alloc(),                                       view), host_mirror_test_space(view));
+  }
+#endif
+  check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view view_alloc
+#ifndef KOKKOS_ENABLE_CXX14
+  // FIXME DynamicView: HostMirror is the same type
+  if constexpr (!is_dynamic_view<View>::value) {
+    check_memory_space(create_mirror_view(view_alloc(WithoutInitializing),                    view), host_mirror_test_space(view));
+    check_memory_space(create_mirror_view(view_alloc(),                                       view), host_mirror_test_space(view));
+  }
+#endif
+  check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror_view(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror view_alloc + execution space
+#ifndef KOKKOS_ENABLE_CXX14
+  // FIXME DynamicView: HostMirror is the same type
+  if constexpr (!is_dynamic_view<View>::value) {
+    check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing),                      view), host_mirror_test_space(view));
+    check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}),                                       view), host_mirror_test_space(view));
+  }
+#endif
+  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{},   WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{},                        DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view view_alloc + execution space
+#ifndef KOKKOS_ENABLE_CXX14
+  // FIXME DynamicView: HostMirror is the same type
+  if constexpr (!is_dynamic_view<View>::value) {
+    check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing),                      view), host_mirror_test_space(view));
+    check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}),                                       view), host_mirror_test_space(view));
+  }
+#endif
+  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{},   WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{},                        DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view_and_copy
+  check_memory_space(create_mirror_view_and_copy(HostSpace{},         view), HostSpace{});
+  check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
+
+  // create_mirror_view_and_copy view_alloc
+  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}),         view), HostSpace{});
+  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view_and_copy view_alloc + execution space
+  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{},         DefaultHostExecutionSpace{}),   view), HostSpace{});
+  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}),       view), DeviceMemorySpace{});
+
+  // clang-format on
+}
+
+void test_create_mirror_dynrankview() {
+  Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
+      "device view", 10);
+  Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
+
+  test_create_mirror_properties(device_view);
+  test_create_mirror_properties(host_view);
+}
+
+void test_create_mirror_offsetview() {
+  Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
+      device_view("device view", {0, 10});
+  Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
+      "host view", {0, 10});
+
+  test_create_mirror_properties(device_view);
+  test_create_mirror_properties(host_view);
+}
+
+void test_create_mirror_dynamicview() {
+  Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
+      device_view("device view", 2, 10);
+  Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
+      "host view", 2, 10);
+
+  test_create_mirror_properties(device_view);
+  test_create_mirror_properties(host_view);
+}
diff --git a/packages/kokkos/core/cmake/Dependencies.cmake b/packages/kokkos/core/cmake/Dependencies.cmake
index cc901a4ede0c6b17fbb89bfa9edfaf6544d7b269..611c089b2e3feec2ec79228360f93c242fc055e2 100644
--- a/packages/kokkos/core/cmake/Dependencies.cmake
+++ b/packages/kokkos/core/cmake/Dependencies.cmake
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
   TEST_OPTIONAL_TPLS CUSPARSE
   )
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 88810b6fc2bbcf5c6fef3bd4a9de0a72fb30c5e8..b7a80ad84ff22b00d9666956cf5896b259d38b6a 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -636,7 +636,7 @@ struct CudaParallelLaunchImpl<
           DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
           base_t::get_kernel_func(), prefer_shmem);
 
-      ensure_cuda_lock_arrays_on_device();
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 
       // Invoke the driver function on the device
       base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
index 3796534816a8bdb6bbeb1e517c6e54a04f2c82e1..84d4307cfd549f9567cb2bc5982a882543e19168 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
 void initialize_host_cuda_lock_arrays() {
 #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
   desul::Impl::init_lock_arrays();
-  desul::ensure_cuda_lock_arrays_on_device();
+
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 #endif
   if (g_host_cuda_lock_arrays.atomic != nullptr) return;
   KOKKOS_IMPL_CUDA_SAFE_CALL(
@@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() {
   Impl::cuda_device_synchronize(
       "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
   g_host_cuda_lock_arrays.n = Cuda::concurrency();
-  copy_cuda_lock_arrays_to_device();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
   init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
                                   256>>>();
   Impl::cuda_device_synchronize(
@@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() {
   g_host_cuda_lock_arrays.atomic = nullptr;
   g_host_cuda_lock_arrays.n      = 0;
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  copy_cuda_lock_arrays_to_device();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
 #endif
 }
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
index 244f142f0d83550bf79d5cfb5288494e2629226f..bdb7723985e5a3c6c0451ada3d0b6b7303204089 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -67,7 +67,7 @@ struct CudaLockArrays {
 
 /// \brief This global variable in Host space is the central definition
 ///        of these arrays.
-extern CudaLockArrays g_host_cuda_lock_arrays;
+extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
 
 /// \brief After this call, the g_host_cuda_lock_arrays variable has
 ///        valid, initialized arrays.
@@ -105,12 +105,12 @@ namespace Impl {
 /// instances in other translation units, we must update this CUDA global
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
-/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
 __device__
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
     __constant__ extern
 #endif
-    CudaLockArrays g_device_cuda_lock_arrays;
+    Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
 
 #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
 
@@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset        = offset >> 2;
   offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
+  return (
+      0 ==
+      atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
 }
 
 /// \brief Release lock for the address
@@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset        = offset >> 2;
   offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
+  atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
 }
 
 }  // namespace Impl
@@ -149,49 +151,45 @@ namespace {
 static int lock_array_copied = 0;
 inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
+}  // namespace Impl
+}  // namespace Kokkos
 
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline
-#else
-static
-#endif
-    void
-    copy_cuda_lock_arrays_to_device() {
-  if (lock_array_copied == 0) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
-                                                  &g_host_cuda_lock_arrays,
-                                                  sizeof(CudaLockArrays)));
+/* Dan Ibanez: it is critical that this code be a macro, so that it will
+   capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
+   putting this in an inline function will NOT do the right thing! */
+#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                      \
+  {                                                                   \
+    if (::Kokkos::Impl::lock_array_copied == 0) {                     \
+      KOKKOS_IMPL_CUDA_SAFE_CALL(                                     \
+          cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
+                             &Kokkos::Impl::g_host_cuda_lock_arrays,  \
+                             sizeof(Kokkos::Impl::CudaLockArrays)));  \
+    }                                                                 \
+    lock_array_copied = 1;                                            \
   }
-  lock_array_copied = 1;
-}
 
 #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline void ensure_cuda_lock_arrays_on_device() {}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
-inline static void ensure_cuda_lock_arrays_on_device() {
-  copy_cuda_lock_arrays_to_device();
-}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
 #else
 
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline void ensure_cuda_lock_arrays_on_device() {}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
 // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
-inline static void ensure_cuda_lock_arrays_on_device() {
-  copy_cuda_lock_arrays_to_device();
-  desul::ensure_cuda_lock_arrays_on_device();
-}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #endif
 
 #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
 
-}  // namespace Impl
-}  // namespace Kokkos
-
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 
 #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index 98733430063d98815ececb0ac9fcf83592a4e681..ac160f8fe268a42e04eebcee2639160e7edbd512 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -465,8 +465,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
  public:
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
+  using value_type     = typename Analysis::value_type;
   using functor_type   = FunctorType;
   using size_type      = Cuda::size_type;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the scan is performed.
+  // Within the scan, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the scan, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
 
  private:
   // Algorithmic constraints:
@@ -477,7 +493,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
   const FunctorType m_functor;
   const Policy m_policy;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
   size_type m_final;
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@@ -501,12 +517,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   __device__ inline void initial() const {
     typename Analysis::Reducer final_reducer(&m_functor);
 
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
 
-    size_type* const shared_value =
-        kokkos_impl_cuda_shared_memory<size_type>() +
+    word_size_type* const shared_value =
+        kokkos_impl_cuda_shared_memory<word_size_type>() +
         word_count.value * threadIdx.y;
 
     final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@@ -532,7 +548,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
     // gridDim.x
     cuda_single_inter_block_reduce_scan<true>(
         final_reducer, blockIdx.x, gridDim.x,
-        kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+        kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
         m_scratch_flags);
   }
 
@@ -541,21 +557,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   __device__ inline void final() const {
     typename Analysis::Reducer final_reducer(&m_functor);
 
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
-    size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
-    size_type* const shared_prefix =
+    word_size_type* const shared_data =
+        kokkos_impl_cuda_shared_memory<word_size_type>();
+    word_size_type* const shared_prefix =
         shared_data + word_count.value * threadIdx.y;
-    size_type* const shared_accum =
+    word_size_type* const shared_accum =
         shared_data + word_count.value * (blockDim.y + 1);
 
     // Starting value for this thread block is the previous block's total.
     if (blockIdx.x) {
-      size_type* const block_total =
+      word_size_type* const block_total =
           m_scratch_space + word_count.value * (blockIdx.x - 1);
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_accum[i] = block_total[i];
@@ -602,7 +619,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
           typename Analysis::pointer_type(shared_data + word_count.value));
 
       {
-        size_type* const block_total =
+        word_size_type* const block_total =
             shared_data + word_count.value * blockDim.y;
         for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
           shared_accum[i] = block_total[i];
@@ -690,8 +707,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
       // How many block are really needed for this much work:
       const int grid_x = (nwork + work_per_block - 1) / work_per_block;
 
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(m_functor) * grid_x);
+      m_scratch_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+              m_policy.space(), Analysis::value_size(m_functor) * grid_x));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
 
@@ -752,10 +770,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                                                  Policy, FunctorType>;
 
  public:
+  using value_type     = typename Analysis::value_type;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Cuda::size_type;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the scan is performed.
+  // Within the scan, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the scan, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
 
  private:
   // Algorithmic constraints:
@@ -766,7 +800,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
   const FunctorType m_functor;
   const Policy m_policy;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
   size_type m_final;
   ReturnType& m_returnvalue;
@@ -791,12 +825,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   __device__ inline void initial() const {
     typename Analysis::Reducer final_reducer(&m_functor);
 
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
 
-    size_type* const shared_value =
-        kokkos_impl_cuda_shared_memory<size_type>() +
+    word_size_type* const shared_value =
+        kokkos_impl_cuda_shared_memory<word_size_type>() +
         word_count.value * threadIdx.y;
 
     final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@@ -822,7 +856,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     // gridDim.x
     cuda_single_inter_block_reduce_scan<true>(
         final_reducer, blockIdx.x, gridDim.x,
-        kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+        kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
         m_scratch_flags);
   }
 
@@ -831,21 +865,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   __device__ inline void final() const {
     typename Analysis::Reducer final_reducer(&m_functor);
 
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
-    size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
-    size_type* const shared_prefix =
+    word_size_type* const shared_data =
+        kokkos_impl_cuda_shared_memory<word_size_type>();
+    word_size_type* const shared_prefix =
         shared_data + word_count.value * threadIdx.y;
-    size_type* const shared_accum =
+    word_size_type* const shared_accum =
         shared_data + word_count.value * (blockDim.y + 1);
 
     // Starting value for this thread block is the previous block's total.
     if (blockIdx.x) {
-      size_type* const block_total =
+      word_size_type* const block_total =
           m_scratch_space + word_count.value * (blockIdx.x - 1);
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_accum[i] = block_total[i];
@@ -894,7 +929,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
           typename Analysis::pointer_type(shared_data + word_count.value));
 
       {
-        size_type* const block_total =
+        word_size_type* const block_total =
             shared_data + word_count.value * blockDim.y;
         for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
           shared_accum[i] = block_total[i];
@@ -983,8 +1018,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       // How many block are really needed for this much work:
       const int grid_x = (nwork + work_per_block - 1) / work_per_block;
 
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(m_functor) * grid_x);
+      m_scratch_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+              m_policy.space(), Analysis::value_size(m_functor) * grid_x));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
 
@@ -1022,7 +1058,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 #endif
         DeepCopy<HostSpace, CudaSpace, Cuda>(
             m_policy.space(), &m_returnvalue,
-            m_scratch_space + (grid_x - 1) * size / sizeof(int), size);
+            m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type),
+            size);
     }
   }
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 078315b65dd20d37adc6973f5ffff3a94836236b..34d4bef9fdaaf038a2fcdab257d043438a348887 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction(
   value = result[0];
   for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
     reducer.join(&value, &result[i]);
+  __syncthreads();
 }
 
 template <class ValueType, class ReducerType>
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
index 5c871e0d615fc58bb01b93566bc0ab7a0ad892b2..dca1fb9073e6de4f5889e0ae61c0f5a5787254de 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
@@ -448,11 +448,27 @@ class ParallelScanHIPBase {
                                                  Policy, FunctorType>;
 
  public:
+  using value_type     = typename Analysis::value_type;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Kokkos::Experimental::HIP::size_type;
   using index_type     = typename Policy::index_type;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::HIP::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the scan is performed.
+  // Within the scan, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the scan, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
 
  protected:
   // Algorithmic constraints:
@@ -463,10 +479,10 @@ class ParallelScanHIPBase {
 
   const FunctorType m_functor;
   const Policy m_policy;
-  size_type* m_scratch_space = nullptr;
-  size_type* m_scratch_flags = nullptr;
-  size_type m_final          = false;
-  int m_grid_x               = 0;
+  word_size_type* m_scratch_space = nullptr;
+  size_type* m_scratch_flags      = nullptr;
+  size_type m_final               = false;
+  int m_grid_x                    = 0;
   // Only let one ParallelReduce/Scan modify the shared memory. The
   // constructor acquires the mutex which is released in the destructor.
   std::lock_guard<std::mutex> m_shared_memory_lock;
@@ -489,12 +505,12 @@ class ParallelScanHIPBase {
   __device__ inline void initial() const {
     typename Analysis::Reducer final_reducer(&m_functor);
 
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
 
     pointer_type const shared_value = reinterpret_cast<pointer_type>(
-        Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
+        Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() +
         word_count.value * threadIdx.y);
 
     final_reducer.init(shared_value);
@@ -518,7 +534,7 @@ class ParallelScanHIPBase {
     // gridDim.x
     hip_single_inter_block_reduce_scan<true>(
         final_reducer, blockIdx.x, gridDim.x,
-        Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
+        Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(),
         m_scratch_space, m_scratch_flags);
   }
 
@@ -527,22 +543,22 @@ class ParallelScanHIPBase {
   __device__ inline void final() const {
     typename Analysis::Reducer final_reducer(&m_functor);
 
-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
-    size_type* const shared_data =
-        Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>();
-    size_type* const shared_prefix =
+    word_size_type* const shared_data =
+        Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>();
+    word_size_type* const shared_prefix =
         shared_data + word_count.value * threadIdx.y;
-    size_type* const shared_accum =
+    word_size_type* const shared_accum =
         shared_data + word_count.value * (blockDim.y + 1);
 
     // Starting value for this thread block is the previous block's total.
     if (blockIdx.x) {
-      size_type* const block_total =
+      word_size_type* const block_total =
           m_scratch_space + word_count.value * (blockIdx.x - 1);
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_accum[i] = block_total[i];
@@ -588,7 +604,7 @@ class ParallelScanHIPBase {
           typename Analysis::pointer_type(shared_data + word_count.value));
 
       {
-        size_type* const block_total =
+        word_size_type* const block_total =
             shared_data + word_count.value * blockDim.y;
         for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
           shared_accum[i] = block_total[i];
@@ -647,8 +663,9 @@ class ParallelScanHIPBase {
       // How many block are really needed for this much work:
       m_grid_x = (nwork + work_per_block - 1) / work_per_block;
 
-      m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
-          m_policy.space(), Analysis::value_size(m_functor) * m_grid_x);
+      m_scratch_space = reinterpret_cast<word_size_type*>(
+          Kokkos::Experimental::Impl::hip_internal_scratch_space(
+              m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
       m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
           m_policy.space(), sizeof(size_type) * 1);
 
@@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
                Kokkos::Experimental::HIP>(
           Base::m_policy.space(), &m_returnvalue,
-          Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int),
+          Base::m_scratch_space + (Base::m_grid_x - 1) * size /
+                                      sizeof(typename Base::word_size_type),
           size);
     }
   }
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
index 1091ad5ceadf6a14b2f162c49d797c6c9564d390..9002f695894e987d34fd1437bbcc1f3da7e2a04c 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp
@@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> {
     }
   }
 
+  template <typename SizeType>
   __device__ static inline bool scalar_inter_block_reduction(
       FunctorType const& functor,
       ::Kokkos::Experimental::HIP::size_type const block_count,
-      ::Kokkos::Experimental::HIP::size_type* const shared_data,
-      ::Kokkos::Experimental::HIP::size_type* const global_data,
+      SizeType* const shared_data, SizeType* const global_data,
       ::Kokkos::Experimental::HIP::size_type* const global_flags) {
     Scalar* const global_team_buffer_element =
         reinterpret_cast<Scalar*>(global_data);
@@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan(
  *  Global reduce result is in the last threads' 'shared_data' location.
  */
 
-template <bool DoScan, class FunctorType>
+template <bool DoScan, typename FunctorType, typename SizeType>
 __device__ bool hip_single_inter_block_reduce_scan_impl(
     FunctorType const& functor,
     ::Kokkos::Experimental::HIP::size_type const block_id,
     ::Kokkos::Experimental::HIP::size_type const block_count,
-    ::Kokkos::Experimental::HIP::size_type* const shared_data,
-    ::Kokkos::Experimental::HIP::size_type* const global_data,
+    SizeType* const shared_data, SizeType* const global_data,
     ::Kokkos::Experimental::HIP::size_type* const global_flags) {
-  using size_type = ::Kokkos::Experimental::HIP::size_type;
-
+  using size_type    = SizeType;
   using value_type   = typename FunctorType::value_type;
   using pointer_type = typename FunctorType::pointer_type;
 
@@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
   return is_last_block;
 }
 
-template <bool DoScan, typename FunctorType>
+template <bool DoScan, typename FunctorType, typename SizeType>
 __device__ bool hip_single_inter_block_reduce_scan(
     FunctorType const& functor,
     ::Kokkos::Experimental::HIP::size_type const block_id,
     ::Kokkos::Experimental::HIP::size_type const block_count,
-    ::Kokkos::Experimental::HIP::size_type* const shared_data,
-    ::Kokkos::Experimental::HIP::size_type* const global_data,
+    SizeType* const shared_data, SizeType* const global_data,
     ::Kokkos::Experimental::HIP::size_type* const global_flags) {
   // If we are doing a reduction and we don't do an array reduction, we use the
   // reduction-only path. Otherwise, we use the common path between reduction
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
index eb85ed4709ed453f40856b05b07e76fd50e06430..d0bbc18da8a1c64839444f5abb8c4d507a01a30b 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
@@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
   value = result[0];
   for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
     reducer.join(&value, &result[i]);
+  __syncthreads();
 }
 
 template <typename ValueType, typename ReducerType>
diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
index 0a66ee9da71fdaf3bb2bf649fb1a7081e4651ea4..d859a5d8ae0f1908b35c4dc31aa9229cfd578bf6 100644
--- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp
+++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
@@ -3711,12 +3711,13 @@ namespace Impl {
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    (std::is_same<
-         typename Kokkos::View<T, P...>::memory_space,
-         typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
-     std::is_same<
-         typename Kokkos::View<T, P...>::data_type,
-         typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        (std::is_same<
+             typename Kokkos::View<T, P...>::memory_space,
+             typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
+         std::is_same<
+             typename Kokkos::View<T, P...>::data_type,
+             typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
     typename Kokkos::View<T, P...>::HostMirror>
 create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
@@ -3725,12 +3726,13 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    !(std::is_same<
-          typename Kokkos::View<T, P...>::memory_space,
-          typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
-      std::is_same<
-          typename Kokkos::View<T, P...>::data_type,
-          typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+        !(std::is_same<typename Kokkos::View<T, P...>::memory_space,
+                       typename Kokkos::View<
+                           T, P...>::HostMirror::memory_space>::value &&
+          std::is_same<
+              typename Kokkos::View<T, P...>::data_type,
+              typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
     typename Kokkos::View<T, P...>::HostMirror>
 create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
@@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
 }
 
 // Create a mirror view in a new space (specialization for same space)
-template <class Space, class T, class... P, class... ViewCtorArgs>
-std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
-                 typename Impl::MirrorViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
 // Create a mirror view in a new space (specialization for different space)
-template <class Space, class T, class... P, class... ViewCtorArgs>
-std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
-                 typename Impl::MirrorViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<!Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  using MemorySpace = typename Space::memory_space;
-  using alloc_prop  = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
-  alloc_prop prop_copy(arg_prop);
-
-  return Kokkos::Impl::create_mirror(src, prop_copy);
+  return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 }  // namespace Impl
 
@@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
-    Kokkos::Impl::WithoutInitializing_t wi, Space const& space,
+    Kokkos::Impl::WithoutInitializing_t wi, Space const&,
     Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror_view(space, v, view_alloc(wi));
+  return Impl::create_mirror_view(
+      v, view_alloc(typename Space::memory_space{}, wi));
 }
 
 template <class T, class... P, class... ViewCtorArgs>
diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp
index e92ed7d2e91395aef45292b3a3b3a4f5c9cd5cf7..f8dcfc869e195b93f2ed899b89f6f64be62c46e3 100644
--- a/packages/kokkos/core/src/Kokkos_View.hpp
+++ b/packages/kokkos/core/src/Kokkos_View.hpp
@@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> {
 };
 
 template <unsigned N, typename... Args>
-KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>>
+KOKKOS_FUNCTION std::enable_if_t<
+    N == View<Args...>::Rank &&
+        std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
+    View<Args...>>
 as_view_of_rank_n(View<Args...> v) {
   return v;
 }
@@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) {
 // Placeholder implementation to compile generic code for DynRankView; should
 // never be called
 template <unsigned N, typename T, typename... Args>
-std::enable_if_t<
-    N != View<T, Args...>::Rank,
+KOKKOS_FUNCTION std::enable_if_t<
+    N != View<T, Args...>::Rank &&
+        std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
     View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
          Args...>>
 as_view_of_rank_n(View<T, Args...>) {
-  Kokkos::Impl::throw_runtime_exception(
-      "Trying to get at a View of the wrong rank");
+  Kokkos::abort("Trying to get at a View of the wrong rank");
   return {};
 }
 
diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
index fafd825df297123e100ccf008069f24e4a2cf1e5..129a489387a46297bdd5ce17d3ad7873cbc1c80b 100644
--- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
@@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
   void push_work(const std::int32_t w) const noexcept {
     const std::int32_t N = m_graph.numRows();
 
-    std::int32_t volatile* const ready_queue = &m_queue[0];
-    std::int32_t volatile* const end_hint    = &m_queue[2 * N + 1];
+    std::int32_t* const ready_queue = &m_queue[0];
+    std::int32_t* const end_hint    = &m_queue[2 * N + 1];
 
     // Push work to end of queue
     const std::int32_t j = atomic_fetch_add(end_hint, 1);
@@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
   std::int32_t pop_work() const noexcept {
     const std::int32_t N = m_graph.numRows();
 
-    std::int32_t volatile* const ready_queue = &m_queue[0];
-    std::int32_t volatile* const begin_hint  = &m_queue[2 * N];
+    std::int32_t* const ready_queue = &m_queue[0];
+    std::int32_t* const begin_hint  = &m_queue[2 * N];
 
     // begin hint is guaranteed to be less than or equal to
     // actual begin location in the queue.
 
-    for (std::int32_t i = *begin_hint; i < N; ++i) {
-      const std::int32_t w = ready_queue[i];
+    for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) {
+      const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]);
 
       if (w == END_TOKEN) {
         return END_TOKEN;
@@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
 
     const std::int32_t N = m_graph.numRows();
 
-    std::int32_t volatile* const count_queue = &m_queue[N];
+    std::int32_t* const count_queue = &m_queue[N];
 
     const std::int32_t B = m_graph.row_map(w);
     const std::int32_t E = m_graph.row_map(w + 1);
@@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const TagCount, int i) const noexcept {
-    std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()];
+    std::int32_t* const count_queue = &m_queue[m_graph.numRows()];
 
     atomic_increment(count_queue + m_graph.entries[i]);
   }
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index 51921765baf249a1f1dacc57221bc4f4a398c79d..a9bc085912356ec90bbef0c63688cd3f91d11a95 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -47,6 +47,7 @@
 #endif
 
 #include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
 
@@ -164,7 +165,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) {
       name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
 }
 
-void OpenMPTarget::impl_initialize(InitializationSettings const&) {
+void OpenMPTarget::impl_initialize(InitializationSettings const& settings) {
+  using Kokkos::Impl::get_gpu;
+  const int device_num = get_gpu(settings);
+  omp_set_default_device(device_num);
+
   Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
 }
 void OpenMPTarget::impl_finalize() {
diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
index c1cb6a7d91b54b951f445131e43347eaaa33a027..ecece72cf958c937702dd18b4a22642e479c857c 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {
 
   return ((uint64_t)a) | (((uint64_t)d) << 32);
 
-#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \
-    defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)
+#elif defined(__powerpc64__) || defined(__ppc64__)
 
-  unsigned int cycles = 0;
+  unsigned long cycles = 0;
 
   asm volatile("mftb %0" : "=r"(cycles));
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
index f624e7a14cb21b4a395898125536ec9b55bfeaae..a5bd0032374ffc5e0e73627e33aeb3fa7c1b788e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -166,6 +166,8 @@ int get_device_count() {
 #elif defined(KOKKOS_ENABLE_OPENACC)
   return acc_get_num_devices(
       Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
+  return omp_get_num_devices();
 #else
   Kokkos::abort("implementation bug");
   return -1;
@@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
     Kokkos::abort("implementation bug");
   }
 
-  auto const* local_rank_str =
-      std::getenv("OMPI_COMM_WORLD_LOCAL_RANK");  // OpenMPI
-  if (!local_rank_str)
-    local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK");  // MVAPICH2
-  if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID");  // SLURM
+  char const* local_rank_str = nullptr;
+  for (char const* env_var : {
+           "OMPI_COMM_WORLD_LOCAL_RANK",  // OpenMPI
+           "MV2_COMM_WORLD_LOCAL_RANK",   // MVAPICH2
+           "MPI_LOCALRANKID",             // MPICH
+           "SLURM_LOCALID",               // SLURM
+           "PMI_LOCAL_RANK"               // PMI
+       }) {
+    local_rank_str = std::getenv(env_var);
+    if (local_rank_str) break;
+  }
 
   // use first GPU available for execution if unable to detect local MPI rank
   if (!local_rank_str) {
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index 738231677c600f6d928122269d848b1a2b51ac46..994dd0b2adf65d9a2440abeb5c4930b43a662d54 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -1128,9 +1128,8 @@ struct ViewOffset<
   KOKKOS_INLINE_FUNCTION constexpr ViewOffset(
       const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs)
       : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
-    static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
-                      (DimRHS::rank == 1 && dimension_type::rank == 1 &&
-                       dimension_type::rank_dynamic == 1),
+    static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) ||
+                   (DimRHS::rank == 1 && dimension_type::rank == 1)),
                   "ViewOffset LayoutLeft and LayoutRight are only compatible "
                   "when rank <= 1");
   }
@@ -1778,8 +1777,7 @@ struct ViewOffset<
       const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs)
       : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
     static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
-                      (DimRHS::rank == 1 && dimension_type::rank == 1 &&
-                       dimension_type::rank_dynamic == 1),
+                      (DimRHS::rank == 1 && dimension_type::rank == 1),
                   "ViewOffset LayoutRight and LayoutLeft are only compatible "
                   "when rank <= 1");
   }
@@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
                    std::is_trivially_copy_assignable<Dummy>::value>
   construct_shared_allocation() {
     // Shortcut for zero initialization
-    ValueType value{};
 // On A64FX memset seems to do the wrong thing with regards to first touch
 // leading to the significant performance issues
 #ifndef KOKKOS_ARCH_A64FX
+    ValueType value{};
     if (Impl::is_zero_byte(value)) {
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
@@ -3539,9 +3537,7 @@ class ViewMapping<
                      typename SrcTraits::array_layout>::value ||
         std::is_same<typename DstTraits::array_layout,
                      Kokkos::LayoutStride>::value ||
-        (DstTraits::dimension::rank == 0) ||
-        (DstTraits::dimension::rank == 1 &&
-         DstTraits::dimension::rank_dynamic == 1)
+        (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1)
   };
 
  public:
diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt
index 24f70c0ccb3208ca3db1acf82e08bfb56d1ef0de..16fdb39d1a36e9dd8b7d65bbe846c28b37fcf496 100644
--- a/packages/kokkos/core/unit_test/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/CMakeLists.txt
@@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
 SET(COMPILE_ONLY_SOURCES
   TestArray.cpp
+  TestCreateMirror.cpp
   TestDetectionIdiom.cpp
   TestInterOp.cpp
   TestLegionInteroperability.cpp
@@ -86,6 +87,7 @@ ENDIF()
 KOKKOS_ADD_EXECUTABLE(
   TestCompileOnly
   SOURCES
+  TestCompileMain.cpp
   ${COMPILE_ONLY_SOURCES}
 )
 
diff --git a/packages/kokkos/core/unit_test/TestCompileMain.cpp b/packages/kokkos/core/unit_test/TestCompileMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..237c8ce181774d991a9dbdd8cacf1a5fb9f199f1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestCompileMain.cpp
@@ -0,0 +1 @@
+int main() {}
diff --git a/packages/kokkos/core/unit_test/TestCreateMirror.cpp b/packages/kokkos/core/unit_test/TestCreateMirror.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8b3b6ea105950267a862a78ebcb632120e74f66
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestCreateMirror.cpp
@@ -0,0 +1,126 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+template <typename TestView, typename MemorySpace>
+void check_memory_space(TestView, MemorySpace) {
+  static_assert(
+      std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
+}
+
+template <class View>
+auto host_mirror_test_space(View) {
+  return std::conditional_t<
+      Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename View::memory_space>::accessible,
+      typename View::memory_space, Kokkos::HostSpace>{};
+}
+
+template <typename View>
+void test_create_mirror_properties(const View& view) {
+  using namespace Kokkos;
+  using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
+
+  // clang-format off
+  
+  // create_mirror
+  check_memory_space(create_mirror(WithoutInitializing,                          view), host_mirror_test_space(view));
+  check_memory_space(create_mirror(                                              view), host_mirror_test_space(view));
+  check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+  check_memory_space(create_mirror(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+
+  // create_mirror_view
+  check_memory_space(create_mirror_view(WithoutInitializing,                          view), host_mirror_test_space(view));
+  check_memory_space(create_mirror_view(                                              view), host_mirror_test_space(view));
+  check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+  check_memory_space(create_mirror_view(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});
+
+  // create_mirror view_alloc
+  check_memory_space(create_mirror(view_alloc(WithoutInitializing),                      view), host_mirror_test_space(view));
+  check_memory_space(create_mirror(view_alloc(),                                         view), host_mirror_test_space(view));
+  check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view view_alloc
+  check_memory_space(create_mirror_view(view_alloc(WithoutInitializing),                      view), host_mirror_test_space(view));
+  check_memory_space(create_mirror_view(view_alloc(),                                         view), host_mirror_test_space(view));
+  check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror_view(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror view_alloc + execution space
+  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing),                        view), host_mirror_test_space(view));
+  check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}),                                         view), host_mirror_test_space(view));
+  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{},   WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{},                        DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view view_alloc + execution space
+  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing),                        view), host_mirror_test_space(view));
+  check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}),                                         view), host_mirror_test_space(view));
+  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{},   WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
+  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{},                        DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view_and_copy
+  check_memory_space(create_mirror_view_and_copy(HostSpace{},         view), HostSpace{});
+  check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
+
+  // create_mirror_view_and_copy view_alloc
+  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}),         view), HostSpace{});
+  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
+
+  // create_mirror_view_and_copy view_alloc + execution space
+  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{},         DefaultHostExecutionSpace{}),   view), HostSpace{});
+  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}),       view), DeviceMemorySpace{});
+
+  // clang-format on
+}
+
+void test() {
+  Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
+                                                                10);
+  Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
+
+  test_create_mirror_properties(device_view);
+  test_create_mirror_properties(host_view);
+}
diff --git a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp
index f87fda615643c5a75ef5ee4da7349bab0eea40cd..23da339cae03246084c0e67e8781d08972fb864e 100644
--- a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp
+++ b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp
@@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value,
 static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
               "Bark's difference_type should be ptrdiff_t!");
 }  // namespace Example
-
-int main() {}
diff --git a/packages/kokkos/core/unit_test/TestScan.hpp b/packages/kokkos/core/unit_test/TestScan.hpp
index 1a4056af07d3f9584b105cd536e5abca051b30c2..356ffde9565aaf40035e033748056cbb3028f678 100644
--- a/packages/kokkos/core/unit_test/TestScan.hpp
+++ b/packages/kokkos/core/unit_test/TestScan.hpp
@@ -45,20 +45,23 @@
 #include <Kokkos_Core.hpp>
 #include <cstdio>
 
-namespace Test {
+namespace {
 
-template <class Device>
+template <class Device, class T, T ImbalanceSz>
 struct TestScan {
   using execution_space = Device;
-  using value_type      = int64_t;
+  using value_type      = T;
 
   Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int iwork, value_type& update,
                   const bool final_pass) const {
-    const value_type n         = iwork + 1;
-    const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0;
+    const value_type n = iwork + 1;
+    const value_type imbalance =
+        ((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
+            ? ImbalanceSz
+            : value_type(0);
 
     // Insert an artificial load imbalance
 
@@ -133,12 +136,29 @@ struct TestScan {
     }
   }
 };
+}  // namespace
 
 TEST(TEST_CATEGORY, scan) {
-  TestScan<TEST_EXECSPACE>::test_range(1, 1000);
-  TestScan<TEST_EXECSPACE>(0);
-  TestScan<TEST_EXECSPACE>(100000);
-  TestScan<TEST_EXECSPACE>(10000000);
-  TEST_EXECSPACE().fence();
+  constexpr auto imbalance_size = 1000;
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
+}
+
+TEST(TEST_CATEGORY, small_size_scan) {
+  constexpr auto imbalance_size = 10;  // Pick to not overflow...
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
+      static_cast<std::size_t>(
+          std::sqrt(std::numeric_limits<std::int8_t>::max())));
+  constexpr auto short_imbalance_size = 100;  // Pick to not overflow...
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
+      static_cast<std::size_t>(
+          std::sqrt(std::numeric_limits<std::int16_t>::max())));
 }
-}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp
index f1d0f9cb3b8a37f35f9b4962e2f183f26701072c..3f05b2ef66a04783a94f854259253cb984411819 100644
--- a/packages/kokkos/core/unit_test/TestTeam.hpp
+++ b/packages/kokkos/core/unit_test/TestTeam.hpp
@@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
 
 }  // namespace
 
+namespace {
+template <typename ExecutionSpace>
+struct TestRepeatedTeamReduce {
+  static constexpr int ncol = 1500;  // nothing special, just some work
+
+  KOKKOS_FUNCTION void operator()(
+      const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
+      const {
+    // non-divisible by power of two to make triggering problems easier
+    constexpr int nlev = 129;
+    constexpr auto pi  = Kokkos::Experimental::pi_v<double>;
+    double b           = 0.;
+    for (int ri = 0; ri < 10; ++ri) {
+      // The contributions here must be sufficiently complex, simply adding ones
+      // wasn't enough to trigger the bug.
+      const auto g1 = [&](const int k, double &acc) {
+        acc += Kokkos::cos(pi * double(k) / nlev);
+      };
+      const auto g2 = [&](const int k, double &acc) {
+        acc += Kokkos::sin(pi * double(k) / nlev);
+      };
+      double a1, a2;
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
+      b += a1;
+      b += a2;
+    }
+    const auto h = [&]() {
+      const auto col = team.league_rank();
+      v(col)         = b + col;
+    };
+    Kokkos::single(Kokkos::PerTeam(team), h);
+  }
+
+  KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
+    if (v(i) != v(0) + i) {
+      ++bad;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
+    }
+  }
+
+  TestRepeatedTeamReduce() : v("v", ncol) { test(); }
+
+  void test() {
+    int team_size_recommended =
+        Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
+            *this, Kokkos::ParallelForTag());
+    // Choose a non-recommened (non-power of two for GPUs) team size
+    int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
+
+    // The failure was non-deterministic so run the test a bunch of times
+    for (int it = 0; it < 100; ++it) {
+      Kokkos::parallel_for(
+          Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
+
+      int bad = 0;
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
+                              *this, bad);
+      ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
+    }
+  }
+
+  Kokkos::View<double *, ExecutionSpace> v;
+};
+
+}  // namespace
+
 }  // namespace Test
 
 /*--------------------------------------------------------------------------*/
diff --git a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
index 469bba23b73ee9bd316f7c2fbcd9389144f03e12..4d4f3b1f4d34eeae219b9436922162f7279ac8ca 100644
--- a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
@@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
   }
 }
 
+TEST(TEST_CATEGORY, repeated_team_reduce) {
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
+    GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not "
+                    "properly implemented";
+#endif
+
+  TestRepeatedTeamReduce<TEST_EXECSPACE>();
+}
+
 }  // namespace Test
 #endif
diff --git a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp
index 03c3b977edeab7ec5b51c406da65b0e089f5a0de..3ac392d3e98860fe536a07fbff43cc7d5fc1aecd 100644
--- a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp
+++ b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp
@@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) {
                           View<double*, left, d_exec>>::test(false, false, 10);
 
   // Layout assignment
+  Impl::TestAssignability<View<int, left, d_exec>,
+                          View<int, right, d_exec>>::test(true, true);
   Impl::TestAssignability<View<int*, left, d_exec>,
                           View<int*, right, d_exec>>::test(true, true, 10);
+  Impl::TestAssignability<View<int[5], left, d_exec>,
+                          View<int*, right, d_exec>>::test(false, false, 10);
+  Impl::TestAssignability<View<int[10], left, d_exec>,
+                          View<int*, right, d_exec>>::test(false, true, 10);
+  Impl::TestAssignability<View<int*, left, d_exec>,
+                          View<int[5], right, d_exec>>::test(true, true);
+  Impl::TestAssignability<View<int[5], left, d_exec>,
+                          View<int[10], right, d_exec>>::test(false, false);
 
   // This could be made possible (due to the degenerate nature of the views) but
   // we do not allow this yet
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index a1a87ce3199d10449b92be4a8e09ecaa790a303f..bd639c847e03cdd0909fc83ccf6d0843148d6bea 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -29,3 +29,4 @@ tag:  3.5.00     date: 11:19:2021    master: c28a8b03    release: 21b879e4
 tag:  3.6.00     date: 04:14:2022    master: 2834f94a    release: 6ea708ff
 tag:  3.6.01     date: 06:16:2022    master: b52f8c83    release: afe9b404
 tag:  3.7.00     date: 08:25:2022    master: d19aab99    release: 0018e5fb
+tag:  3.7.01     date: 12:01:2022    master: 61d7db55    release: d3bb8cfe
diff --git a/packages/kokkos/simd/cmake/Dependencies.cmake b/packages/kokkos/simd/cmake/Dependencies.cmake
index 5e29157369c9ab8cab935a1bfc4c6dad2fdd0296..1d71d8af341181f689a6a8bf63036b67584cb138 100644
--- a/packages/kokkos/simd/cmake/Dependencies.cmake
+++ b/packages/kokkos/simd/cmake/Dependencies.cmake
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
   )
diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
index 2166fa3cb78e70af887ff7f74e2cac9f141bf1de..1815adb4a7621c8b4b3d93ac626c417f1c42644b 100644
--- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
+++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
@@ -76,7 +76,7 @@ namespace Impl {
 /// instances in other translation units, we must update this CUDA global
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
-/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
 __device__
 #ifdef __CUDACC_RDC__
     __constant__ extern
@@ -138,42 +138,33 @@ namespace {
 static int lock_array_copied = 0;
 inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
-
-#ifdef __CUDACC_RDC__
-inline
-#else
-static
-#endif
-    void
-    copy_cuda_lock_arrays_to_device() {
-  if (lock_array_copied == 0) {
-    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
-                       &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
-                       sizeof(int32_t*));
-    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
-                       &CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
-                       sizeof(int32_t*));
-  }
-  lock_array_copied = 1;
-}
-
 }  // namespace Impl
 }  // namespace desul
+/* It is critical that this code be a macro, so that it will
+   capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                       \
+  {                                                                        \
+    if (::desul::Impl::lock_array_copied == 0) {                           \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE,    \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
+                         sizeof(int32_t*));                                \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE,      \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h,   \
+                         sizeof(int32_t*));                                \
+    }                                                                      \
+    ::desul::Impl::lock_array_copied = 1;                                  \
+  }
 
 #endif /* defined( __CUDACC__ ) */
 
 #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
 
-namespace desul {
-
 #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
-inline void ensure_cuda_lock_arrays_on_device() {}
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
-static inline void ensure_cuda_lock_arrays_on_device() {
-  Impl::copy_cuda_lock_arrays_to_device();
-}
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
-}  // namespace desul
-
-#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
diff --git a/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
index 19944b378e2c47090dbe3ce28913017a3f308933..cb8482c5da8b83bb1fc6323dea09fffce86d115b 100644
--- a/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
+++ b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
@@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
                              "init_lock_arrays_cuda: cudaMalloc host locks");
 
   auto error_sync1 = cudaDeviceSynchronize();
-  copy_cuda_lock_arrays_to_device();
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
   check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
   init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
   auto error_sync2 = cudaDeviceSynchronize();
@@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
   CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
   CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
 #ifdef __CUDACC_RDC__
-  copy_cuda_lock_arrays_to_device();
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
 #endif
 }