From 95b3a702d689cb12fed8c321811e8afcff4485d4 Mon Sep 17 00:00:00 2001
From: Stephane Del Pino <stephane.delpino44@gmail.com>
Date: Sat, 3 Sep 2022 18:40:54 +0200
Subject: [PATCH] git subrepo pull (merge) --force packages/kokkos

subrepo:
  subdir:   "packages/kokkos"
  merged:   "d19aab998"
upstream:
  origin:   "git@github.com:kokkos/kokkos.git"
  branch:   "master"
  commit:   "d19aab998"
git-subrepo:
  version:  "0.4.3"
  origin:   "git@github.com:ingydotnet/git-subrepo.git"
  commit:   "2f68596"
---
 .../continuous-integration-workflow.yml       | 21 +++--
 packages/kokkos/.gitrepo                      |  4 +-
 packages/kokkos/CHANGELOG.md                  | 16 ++++
 packages/kokkos/CMakeLists.txt                |  2 +-
 packages/kokkos/Makefile.kokkos               |  2 +-
 .../kokkos/algorithms/src/Kokkos_Sort.hpp     | 79 ++++++++-----------
 .../kokkos/algorithms/unit_tests/TestSort.hpp | 58 ++++++++++++++
 .../containers/src/Kokkos_ScatterView.hpp     | 40 +---------
 .../kokkos/containers/src/Kokkos_Vector.hpp   | 18 ++---
 .../containers/unit_tests/TestVector.hpp      | 17 ++++
 packages/kokkos/core/src/CMakeLists.txt       |  1 +
 .../core/src/Cuda/Kokkos_Cuda_Instance.cpp    |  9 +++
 .../kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp |  2 +-
 .../core/src/HIP/Kokkos_HIP_Instance.cpp      |  4 +-
 .../core/src/HIP/Kokkos_HIP_KernelLaunch.hpp  |  2 +
 .../kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp  |  3 +-
 .../kokkos/core/src/HIP/Kokkos_HIP_Space.cpp  |  9 +++
 packages/kokkos/core/src/HPX/Kokkos_HPX.cpp   |  9 +++
 packages/kokkos/core/src/Kokkos_Cuda.hpp      |  1 +
 packages/kokkos/core/src/Kokkos_HIP_Space.hpp |  3 +
 packages/kokkos/core/src/Kokkos_HPX.hpp       |  1 +
 packages/kokkos/core/src/Kokkos_OpenMP.hpp    |  1 +
 .../kokkos/core/src/Kokkos_OpenMPTarget.hpp   |  3 +
 packages/kokkos/core/src/Kokkos_SYCL.hpp      |  3 +
 packages/kokkos/core/src/Kokkos_Serial.hpp    |  1 +
 packages/kokkos/core/src/Kokkos_Threads.hpp   |  1 +
 .../core/src/OpenMP/Kokkos_OpenMP_Exec.cpp    | 14 +++-
 .../core/src/OpenMP/Kokkos_OpenMP_Exec.hpp    | 10 ++-
 .../core/src/SYCL/Kokkos_SYCL_Instance.hpp    |  2 +-
 .../core/src/Threads/Kokkos_ThreadsExec.cpp   | 70 +++++++++++++---
 .../core/src/Threads/Kokkos_ThreadsExec.hpp   |  5 +-
 .../src/impl/Kokkos_Profiling_Interface.hpp   |  7 +-
 .../kokkos/core/src/impl/Kokkos_Serial.cpp    |  9 +++
 .../core/src/impl/Kokkos_ViewMapping.hpp      | 59 +++++++-------
 .../core/src/impl/Kokkos_ViewTracker.hpp      |  2 +
 .../kokkos/core/unit_test/TestViewAPI.hpp     | 14 ++++
 .../kokkos/core/unit_test/TestViewAPI_e.hpp   | 29 +++++++
 .../unit_test/tools/TestEventCorrectness.hpp  | 15 ++--
 packages/kokkos/master_history.txt            |  1 +
 .../docker/Dockerfile.kokkosllvmproject       |  2 +
 .../kokkos/scripts/docker/Dockerfile.nvcc     |  2 +
 .../scripts/docker/Dockerfile.openmptarget    |  2 +-
 .../kokkos/scripts/docker/Dockerfile.sycl     |  2 +
 43 files changed, 385 insertions(+), 170 deletions(-)

diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
index f6a8f2230..dba3a7074 100644
--- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
@@ -13,24 +13,32 @@ jobs:
         distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest']
         cxx: ['g++', 'clang++']
         cmake_build_type: ['Release', 'Debug']
-        openmp: ['ON']
+        backend: ['OPENMP']
         include:
           - distro: 'fedora:intel'
             cxx: 'icpc'
             cmake_build_type: 'Release'
-            openmp: 'ON'
+            backend: 'OPENMP'
           - distro: 'fedora:intel'
             cxx: 'icpc'
             cmake_build_type: 'Debug'
-            openmp: 'ON'
+            backend: 'OPENMP'
           - distro: 'fedora:intel'
             cxx: 'icpx'
             cmake_build_type: 'Release'
-            openmp: 'ON'
+            backend: 'OPENMP'
           - distro: 'fedora:intel'
             cxx: 'icpx'
             cmake_build_type: 'Debug'
-            openmp: 'ON'
+            backend: 'OPENMP'
+          - distro: 'ubuntu:latest'
+            cxx: 'clang++'
+            cmake_build_type: 'RelWithDebInfo'
+            backend: 'THREADS'
+          - distro: 'ubuntu:latest'
+            cxx: 'g++'
+            cmake_build_type: 'RelWithDebInfo'
+            backend: 'THREADS'
     runs-on: ubuntu-latest
     container:
       image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }}
@@ -54,8 +62,9 @@ jobs:
         run: |
           cmake -B builddir \
             -DCMAKE_INSTALL_PREFIX=/usr \
+            -DKokkos_ARCH_NATIVE=ON \
             -DKokkos_ENABLE_HWLOC=ON \
-            -DKokkos_ENABLE_OPENMP=${{ matrix.openmp }} \
+            -DKokkos_ENABLE_${{ matrix.backend }}=ON \
             -DKokkos_ENABLE_TESTS=ON \
             -DKokkos_ENABLE_EXAMPLES=ON \
             -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
index 6b9388486..91e0f8daa 100644
--- a/packages/kokkos/.gitrepo
+++ b/packages/kokkos/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:kokkos/kokkos.git
 	branch = master
-	commit = b52f8c835f4df003954dad66d9761094f8baa66c
-	parent = 1994bb4f069142aa3f886d30aa0585ed9117eed1
+	commit = d19aab9981a2c447e832a7b4eb7b16992328fb14
+	parent = a64ea7589ca011edd41ad9a3468d091cd093430c
 	method = merge
 	cmdver = 0.4.3
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index dfbe22edd..a90850770 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,5 +1,21 @@
 # Change Log
 
+## [3.6.01](https://github.com/kokkos/kokkos/tree/3.6.01) (2022-05-23)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.00...3.6.01)
+
+### Bug Fixes:
+- Fix Threads: Fix serial resizing scratch space (3.6.01 cherry-pick) [\#5109](https://github.com/kokkos/kokkos/pull/5109)
+- Fix ScatterMin/ScatterMax to use proper atomics (3.6.01 cherry-pick) [\#5046](https://github.com/kokkos/kokkos/pull/5046)
+- Fix allocating large Views [\#4907](https://github.com/kokkos/kokkos/pull/4907)
+- Fix bounds errors with Kokkos::sort [\#4980](https://github.com/kokkos/kokkos/pull/4980)
+- Fix HIP version when printing the configuration [\#4872](https://github.com/kokkos/kokkos/pull/4872)
+- Fixed `_CUDA_ARCH__` to `__CUDA_ARCH__` for CUDA LDG [\#4893](https://github.com/kokkos/kokkos/pull/4893)
+- Fixed an incorrect struct initialization [\#5028](https://github.com/kokkos/kokkos/pull/5028)
+- Fix racing condition in `HIPParallelLaunch` [\#5008](https://github.com/kokkos/kokkos/pull/5008)
+- Avoid deprecation warnings with `OpenMPExec::validate_partition` [\#4982](https://github.com/kokkos/kokkos/pull/4982)
+- Make View self-assignment not produce double-free [\#5024](https://github.com/kokkos/kokkos/pull/5024)
+
+
 ## [3.6.00](https://github.com/kokkos/kokkos/tree/3.6.00) (2022-02-18)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.5.00...3.6.00)
 
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index e1c689372..b0a54118a 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -136,7 +136,7 @@ ENDIF()
 
 set(Kokkos_VERSION_MAJOR 3)
 set(Kokkos_VERSION_MINOR 6)
-set(Kokkos_VERSION_PATCH 00)
+set(Kokkos_VERSION_PATCH 01)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index b1afed5d0..10c4bc463 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -2,7 +2,7 @@
 
 KOKKOS_VERSION_MAJOR = 3
 KOKKOS_VERSION_MINOR = 6
-KOKKOS_VERSION_PATCH = 00
+KOKKOS_VERSION_PATCH = 01
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
index cde5e6857..ce97de9b7 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -422,54 +422,34 @@ class BinSort {
 
 template <class KeyViewType>
 struct BinOp1D {
-  int max_bins_;
-  double mul_;
-  typename KeyViewType::const_value_type range_;
-  typename KeyViewType::const_value_type min_;
+  int max_bins_ = {};
+  double mul_   = {};
+  double min_   = {};
 
-  BinOp1D()
-      : max_bins_(0),
-        mul_(0.0),
-        range_(typename KeyViewType::const_value_type()),
-        min_(typename KeyViewType::const_value_type()) {}
+  BinOp1D() = default;
 
   // Construct BinOp with number of bins, minimum value and maxuimum value
   BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
           typename KeyViewType::const_value_type max)
       : max_bins_(max_bins__ + 1),
-        // Cast to int64_t to avoid possible overflow when using integer
-        mul_(std::is_integral<typename KeyViewType::const_value_type>::value
-                 ? 1.0 * max_bins__ / (int64_t(max) - int64_t(min))
-                 : 1.0 * max_bins__ / (max - min)),
-        range_(max - min),
-        min_(min) {
+        // Cast to double to avoid possible overflow when using integer
+        mul_(static_cast<double>(max_bins__) /
+             (static_cast<double>(max) - static_cast<double>(min))),
+        min_(static_cast<double>(min)) {
     // For integral types the number of bins may be larger than the range
     // in which case we can exactly have one unique value per bin
     // and then don't need to sort bins.
     if (std::is_integral<typename KeyViewType::const_value_type>::value &&
-        static_cast<uint64_t>(range_) <= static_cast<uint64_t>(max_bins__)) {
+        (static_cast<double>(max) - static_cast<double>(min)) <=
+            static_cast<double>(max_bins__)) {
       mul_ = 1.;
     }
   }
 
   // Determine bin index from key value
-  template <
-      class ViewType,
-      std::enable_if_t<!std::is_integral<typename ViewType::value_type>::value,
-                       bool> = true>
-  KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const {
-    return int(mul_ * (keys(i) - min_));
-  }
-
-  // Determine bin index from key value
-  template <
-      class ViewType,
-      std::enable_if_t<std::is_integral<typename ViewType::value_type>::value,
-                       bool> = true>
+  template <class ViewType>
   KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const {
-    // The cast to int64_t is necessary because otherwise HIP returns the wrong
-    // result.
-    return int(mul_ * (int64_t(keys(i)) - int64_t(min_)));
+    return static_cast<int>(mul_ * (static_cast<double>(keys(i)) - min_));
   }
 
   // Return maximum bin index + 1
@@ -486,10 +466,9 @@ struct BinOp1D {
 
 template <class KeyViewType>
 struct BinOp3D {
-  int max_bins_[3];
-  double mul_[3];
-  typename KeyViewType::non_const_value_type range_[3];
-  typename KeyViewType::non_const_value_type min_[3];
+  int max_bins_[3] = {};
+  double mul_[3]   = {};
+  double min_[3]   = {};
 
   BinOp3D() = default;
 
@@ -498,15 +477,15 @@ struct BinOp3D {
     max_bins_[0] = max_bins__[0];
     max_bins_[1] = max_bins__[1];
     max_bins_[2] = max_bins__[2];
-    mul_[0]      = 1.0 * max_bins__[0] / (max[0] - min[0]);
-    mul_[1]      = 1.0 * max_bins__[1] / (max[1] - min[1]);
-    mul_[2]      = 1.0 * max_bins__[2] / (max[2] - min[2]);
-    range_[0]    = max[0] - min[0];
-    range_[1]    = max[1] - min[1];
-    range_[2]    = max[2] - min[2];
-    min_[0]      = min[0];
-    min_[1]      = min[1];
-    min_[2]      = min[2];
+    mul_[0]      = static_cast<double>(max_bins__[0]) /
+              (static_cast<double>(max[0]) - static_cast<double>(min[0]));
+    mul_[1] = static_cast<double>(max_bins__[1]) /
+              (static_cast<double>(max[1]) - static_cast<double>(min[1]));
+    mul_[2] = static_cast<double>(max_bins__[2]) /
+              (static_cast<double>(max[2]) - static_cast<double>(min[2]));
+    min_[0] = static_cast<double>(min[0]);
+    min_[1] = static_cast<double>(min[1]);
+    min_[2] = static_cast<double>(min[2]);
   }
 
   template <class ViewType>
@@ -596,9 +575,9 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
   // TODO: figure out better max_bins then this ...
   int64_t max_bins = view.extent(0) / 2;
   if (std::is_integral<typename ViewType::non_const_value_type>::value) {
-    // Cast to int64_t to avoid possible overflow when using integer
-    int64_t const max_val = result.max_val;
-    int64_t const min_val = result.min_val;
+    // Cast to double to avoid possible overflow when using integer
+    auto const max_val = static_cast<double>(result.max_val);
+    auto const min_val = static_cast<double>(result.min_val);
     // using 10M as the cutoff for special behavior (roughly 40MB for the count
     // array)
     if ((max_val - min_val) < 10000000) {
@@ -606,6 +585,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
       sort_in_bins = false;
     }
   }
+  if (std::is_floating_point<typename ViewType::non_const_value_type>::value) {
+    KOKKOS_ASSERT(std::isfinite(static_cast<double>(result.max_val) -
+                                static_cast<double>(result.min_val)));
+  }
 
   BinSort<ViewType, CompType> bin_sort(
       view, CompType(max_bins, result.min_val, result.max_val), sort_in_bins);
diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
index a03847f2b..9108731c1 100644
--- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -353,6 +353,55 @@ void test_issue_1160_impl() {
   }
 }
 
+template <class ExecutionSpace>
+void test_issue_4978_impl() {
+  Kokkos::View<long long*, ExecutionSpace> element_("element", 9);
+
+  auto h_element = Kokkos::create_mirror_view(element_);
+
+  h_element(0) = LLONG_MIN;
+  h_element(1) = 0;
+  h_element(2) = 3;
+  h_element(3) = 2;
+  h_element(4) = 1;
+  h_element(5) = 3;
+  h_element(6) = 6;
+  h_element(7) = 4;
+  h_element(8) = 3;
+
+  ExecutionSpace exec;
+  Kokkos::deep_copy(exec, element_, h_element);
+
+  Kokkos::sort(exec, element_);
+
+  Kokkos::deep_copy(exec, h_element, element_);
+  exec.fence();
+
+  ASSERT_EQ(h_element(0), LLONG_MIN);
+  ASSERT_EQ(h_element(1), 0);
+  ASSERT_EQ(h_element(2), 1);
+  ASSERT_EQ(h_element(3), 2);
+  ASSERT_EQ(h_element(4), 3);
+  ASSERT_EQ(h_element(5), 3);
+  ASSERT_EQ(h_element(6), 3);
+  ASSERT_EQ(h_element(7), 4);
+  ASSERT_EQ(h_element(8), 6);
+}
+
+template <class ExecutionSpace, class T>
+void test_sort_integer_overflow() {
+  // array with two extrema in reverse order to expose integer overflow bug in
+  // bin calculation
+  T a[2]  = {Kokkos::Experimental::finite_max<T>::value,
+            Kokkos::Experimental::finite_min<T>::value};
+  auto vd = Kokkos::create_mirror_view_and_copy(
+      ExecutionSpace(), Kokkos::View<T[2], Kokkos::HostSpace>(a));
+  Kokkos::sort(vd, /*force using Kokkos bin sort*/ true);
+  auto vh = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), vd);
+  EXPECT_TRUE(std::is_sorted(vh.data(), vh.data() + 2))
+      << "view (" << vh[0] << ", " << vh[1] << ") is not sorted";
+}
+
 //----------------------------------------------------------------------------
 
 template <class ExecutionSpace, typename KeyType>
@@ -376,6 +425,11 @@ void test_issue_1160_sort() {
   test_issue_1160_impl<ExecutionSpace>();
 }
 
+template <class ExecutionSpace>
+void test_issue_4978_sort() {
+  test_issue_4978_impl<ExecutionSpace>();
+}
+
 template <class ExecutionSpace, typename KeyType>
 void test_sort(unsigned int N) {
   test_1D_sort<ExecutionSpace, KeyType>(N);
@@ -385,6 +439,10 @@ void test_sort(unsigned int N) {
   test_dynamic_view_sort<ExecutionSpace, KeyType>(N);
 #endif
   test_issue_1160_sort<ExecutionSpace>();
+  test_issue_4978_sort<ExecutionSpace>();
+  test_sort_integer_overflow<ExecutionSpace, long long>();
+  test_sort_integer_overflow<ExecutionSpace, unsigned long long>();
+  test_sort_integer_overflow<ExecutionSpace, int>();
 }
 }  // namespace Impl
 }  // namespace Test
diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
index 024b4618a..e4dd9531f 100644
--- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -369,18 +369,6 @@ struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, DeviceType,
     Kokkos::atomic_div(&value, rhs);
   }
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  void atomic_prod(ValueType& dest, const ValueType& src) const {
-    bool success = false;
-    while (!success) {
-      ValueType dest_old = dest;
-      ValueType dest_new = dest_old * src;
-      dest_new =
-          Kokkos::atomic_compare_exchange<ValueType>(&dest, dest_old, dest_new);
-      success = ((dest_new - dest_old) / dest_old <= 1e-15);
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void join(ValueType& dest, const ValueType& src) const {
     atomic_prod(&dest, src);
@@ -440,21 +428,9 @@ struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, DeviceType,
   KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other)
       : value(other.value) {}
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  void atomic_min(ValueType& dest, const ValueType& src) const {
-    bool success = false;
-    while (!success) {
-      ValueType dest_old = dest;
-      ValueType dest_new = (dest_old > src) ? src : dest_old;
-      dest_new =
-          Kokkos::atomic_compare_exchange<ValueType>(&dest, dest_old, dest_new);
-      success = ((dest_new - dest_old) / dest_old <= 1e-15);
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void join(ValueType& dest, const ValueType& src) const {
-    atomic_min(dest, src);
+    atomic_min(&dest, src);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -511,21 +487,9 @@ struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, DeviceType,
   KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other)
       : value(other.value) {}
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  void atomic_max(ValueType& dest, const ValueType& src) const {
-    bool success = false;
-    while (!success) {
-      ValueType dest_old = dest;
-      ValueType dest_new = (dest_old < src) ? src : dest_old;
-      dest_new =
-          Kokkos::atomic_compare_exchange<ValueType>(&dest, dest_old, dest_new);
-      success = ((dest_new - dest_old) / dest_old <= 1e-15);
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void join(ValueType& dest, const ValueType& src) const {
-    atomic_max(dest, src);
+    atomic_max(&dest, src);
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp
index 88721bd89..eddb87800 100644
--- a/packages/kokkos/containers/src/Kokkos_Vector.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp
@@ -162,7 +162,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
     }
     DV::sync_host();
     DV::modify_host();
-    if (it < begin() || it > end())
+    if (std::less<>()(it, begin()) || std::less<>()(end(), it))
       Kokkos::abort("Kokkos::vector::insert : invalid insert iterator");
     if (count == 0) return it;
     ptrdiff_t start = std::distance(begin(), it);
@@ -189,27 +189,21 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
                           iterator>::type
   insert(iterator it, InputIterator b, InputIterator e) {
     ptrdiff_t count = std::distance(b, e);
-    if (count == 0) return it;
 
     DV::sync_host();
     DV::modify_host();
-    if (it < begin() || it > end())
+    if (std::less<>()(it, begin()) || std::less<>()(end(), it))
       Kokkos::abort("Kokkos::vector::insert : invalid insert iterator");
 
-    bool resized = false;
-    if ((size() == 0) && (it == begin())) {
-      resize(count);
-      it      = begin();
-      resized = true;
-    }
     ptrdiff_t start = std::distance(begin(), it);
     auto org_size   = size();
-    if (!resized) resize(size() + count);
-    it = begin() + start;
+
+    // Note: resize(...) invalidates it; use begin() + start instead
+    resize(size() + count);
 
     std::copy_backward(begin() + start, begin() + org_size,
                        begin() + org_size + count);
-    std::copy(b, e, it);
+    std::copy(b, e, begin() + start);
 
     return begin() + start;
   }
diff --git a/packages/kokkos/containers/unit_tests/TestVector.hpp b/packages/kokkos/containers/unit_tests/TestVector.hpp
index 57b92c38f..c093c7b0c 100644
--- a/packages/kokkos/containers/unit_tests/TestVector.hpp
+++ b/packages/kokkos/containers/unit_tests/TestVector.hpp
@@ -172,6 +172,23 @@ struct test_vector_insert {
       run_test(a);
       check_test(a, size);
     }
+    { test_vector_insert_into_empty(size); }
+  }
+
+  void test_vector_insert_into_empty(const size_t size) {
+    using Vector = Kokkos::vector<Scalar, Device>;
+    {
+      Vector a;
+      Vector b(size);
+      a.insert(a.begin(), b.begin(), b.end());
+      ASSERT_EQ(a.size(), size);
+    }
+
+    {
+      Vector c;
+      c.insert(c.begin(), size, Scalar{});
+      ASSERT_EQ(c.size(), size);
+    }
   }
 };
 
diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt
index 88cca93f3..793e07a84 100644
--- a/packages/kokkos/core/src/CMakeLists.txt
+++ b/packages/kokkos/core/src/CMakeLists.txt
@@ -8,6 +8,7 @@ KOKKOS_INCLUDE_DIRECTORIES(
 INSTALL (DIRECTORY
   "${CMAKE_CURRENT_SOURCE_DIR}/"
   DESTINATION ${KOKKOS_HEADER_DIR}
+  FILES_MATCHING
   PATTERN desul/src EXCLUDE
   PATTERN "*.inc"
   PATTERN "*.inc_*"
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 294be2774..aaa9ea8ad 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -1007,6 +1007,15 @@ void CudaSpaceInitializer::print_configuration(std::ostream &msg,
 }
 
 }  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_CXX14
+namespace Tools {
+namespace Experimental {
+constexpr DeviceType DeviceTypeTraits<Cuda>::id;
+}
+}  // namespace Tools
+#endif
+
 }  // namespace Kokkos
 
 #else
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
index 61563a010..dec6ef15e 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -139,7 +139,7 @@ struct CudaLDGFetch {
 
   template <typename iType>
   KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const {
-#if defined(__CUDA_ARCH__) && (350 <= _CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__)
     AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
     return *(reinterpret_cast<ValueType*>(&v));
 #else
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index 4a6a3ba99..a8a0496af 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -132,7 +132,8 @@ void HIPInternal::print_configuration(std::ostream &s) const {
   s << "macro  KOKKOS_ENABLE_HIP : defined" << '\n';
 #if defined(HIP_VERSION)
   s << "macro  HIP_VERSION = " << HIP_VERSION << " = version "
-    << HIP_VERSION / 100 << "." << HIP_VERSION % 100 << '\n';
+    << HIP_VERSION_MAJOR << '.' << HIP_VERSION_MINOR << '.' << HIP_VERSION_PATCH
+    << '\n';
 #endif
 
   for (int i = 0; i < dev_info.m_hipDevCount; ++i) {
@@ -467,7 +468,6 @@ void HIPInternal::finalize() {
 }
 
 char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
-  std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
   if (d_driverWorkArray == nullptr) {
     KOKKOS_IMPL_HIP_SAFE_CALL(
         hipHostMalloc(&d_driverWorkArray,
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
index 384b7ffd6..70b979e00 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
@@ -490,6 +490,8 @@ struct HIPParallelLaunch<
 
       KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
 
+      std::lock_guard<std::mutex> const lock(hip_instance->m_mutexWorkArray);
+
       // Invoke the driver function on the device
       DriverType *d_driver = reinterpret_cast<DriverType *>(
           hip_instance->get_next_driver(sizeof(DriverType)));
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
index f334d9341..e9cfbf99f 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
@@ -56,8 +56,7 @@ namespace Kokkos {
 
 #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 namespace Impl {
-__device__ __constant__ HIPLockArrays g_device_hip_lock_arrays = {nullptr,
-                                                                  nullptr, 0};
+__device__ __constant__ HIPLockArrays g_device_hip_lock_arrays = {nullptr, 0};
 }
 #endif
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
index 6ade677fa..776b7c6ab 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -464,6 +464,15 @@ void HIPSpaceInitializer::print_configuration(std::ostream& msg,
 }
 
 }  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_CXX14
+namespace Tools {
+namespace Experimental {
+constexpr DeviceType DeviceTypeTraits<Kokkos::Experimental::HIP>::id;
+}
+}  // namespace Tools
+#endif
+
 }  // namespace Kokkos
 
 //==============================================================================
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
index acf2224f0..623c7da02 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
@@ -199,6 +199,15 @@ void HPXSpaceInitializer::print_configuration(std::ostream &msg,
 }
 
 }  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_CXX14
+namespace Tools {
+namespace Experimental {
+constexpr DeviceType DeviceTypeTraits<Kokkos::Experimental::HPX>::id;
+}
+}  // namespace Tools
+#endif
+
 }  // namespace Kokkos
 
 #else
diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp
index 6305a1fa5..0063b1cd1 100644
--- a/packages/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp
@@ -260,6 +260,7 @@ template <>
 struct DeviceTypeTraits<Cuda> {
   /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling
   static constexpr DeviceType id = DeviceType::Cuda;
+  static int device_id(const Cuda& exec) { return exec.cuda_device(); }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
index 1371d21d3..68869a607 100644
--- a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
@@ -571,6 +571,9 @@ namespace Experimental {
 template <>
 struct DeviceTypeTraits<Kokkos::Experimental::HIP> {
   static constexpr DeviceType id = DeviceType::HIP;
+  static int device_id(const Kokkos::Experimental::HIP& exec) {
+    return exec.hip_device();
+  }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/Kokkos_HPX.hpp b/packages/kokkos/core/src/Kokkos_HPX.hpp
index d2ae9c0ec..9238ca30a 100644
--- a/packages/kokkos/core/src/Kokkos_HPX.hpp
+++ b/packages/kokkos/core/src/Kokkos_HPX.hpp
@@ -500,6 +500,7 @@ namespace Experimental {
 template <>
 struct DeviceTypeTraits<Kokkos::Experimental::HPX> {
   static constexpr DeviceType id = DeviceType::HPX;
+  static int device_id(const Kokkos::Experimental::HPX &) { return 0; }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
index 5d76e689f..767e5b932 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -179,6 +179,7 @@ namespace Experimental {
 template <>
 struct DeviceTypeTraits<OpenMP> {
   static constexpr DeviceType id = DeviceType::OpenMP;
+  static int device_id(const OpenMP&) { return 0; }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
index f394f3240..373dc3d9c 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@@ -130,6 +130,9 @@ template <>
 struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> {
   static constexpr DeviceType id =
       ::Kokkos::Profiling::Experimental::DeviceType::OpenMPTarget;
+  static int device_id(const Kokkos::Experimental::OpenMPTarget&) {
+    return omp_get_default_device();
+  }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp
index 02095ff7b..e29093db3 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp
@@ -182,6 +182,9 @@ template <>
 struct DeviceTypeTraits<Kokkos::Experimental::SYCL> {
   /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling
   static constexpr DeviceType id = DeviceType::SYCL;
+  static int device_id(const Kokkos::Experimental::SYCL& exec) {
+    return exec.sycl_device();
+  }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Kokkos_Serial.hpp
index 9aada48bf..b2e524c37 100644
--- a/packages/kokkos/core/src/Kokkos_Serial.hpp
+++ b/packages/kokkos/core/src/Kokkos_Serial.hpp
@@ -226,6 +226,7 @@ namespace Experimental {
 template <>
 struct DeviceTypeTraits<Serial> {
   static constexpr DeviceType id = DeviceType::Serial;
+  static int device_id(const Serial&) { return 0; }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Kokkos_Threads.hpp
index 45a2d0e32..5879209f1 100644
--- a/packages/kokkos/core/src/Kokkos_Threads.hpp
+++ b/packages/kokkos/core/src/Kokkos_Threads.hpp
@@ -175,6 +175,7 @@ namespace Experimental {
 template <>
 struct DeviceTypeTraits<Threads> {
   static constexpr DeviceType id = DeviceType::Threads;
+  static int device_id(const Threads&) { return 0; }
 };
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
index d2283d456..66dbbacce 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
@@ -67,8 +67,9 @@ __thread int t_openmp_hardware_id            = 0;
 __thread Impl::OpenMPExec *t_openmp_instance = nullptr;
 
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
-void OpenMPExec::validate_partition(const int nthreads, int &num_partitions,
-                                    int &partition_size) {
+void OpenMPExec::validate_partition_impl(const int nthreads,
+                                         int &num_partitions,
+                                         int &partition_size) {
   if (nthreads == 1) {
     num_partitions = 1;
     partition_size = 1;
@@ -506,6 +507,15 @@ void OpenMPSpaceInitializer::print_configuration(std::ostream &msg,
 }
 
 }  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_CXX14
+namespace Tools {
+namespace Experimental {
+constexpr DeviceType DeviceTypeTraits<OpenMP>::id;
+}
+}  // namespace Tools
+#endif
+
 }  // namespace Kokkos
 
 #else
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
index 2f647af77..ede24d109 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -93,7 +93,11 @@ class OpenMPExec {
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED static void validate_partition(const int nthreads,
                                                    int& num_partitions,
-                                                   int& partition_size);
+                                                   int& partition_size) {
+    validate_partition_impl(nthreads, num_partitions, partition_size);
+  }
+  static void validate_partition_impl(const int nthreads, int& num_partitions,
+                                      int& partition_size);
 #endif
 
  private:
@@ -179,8 +183,8 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
 
     Exec* prev_instance = Impl::t_openmp_instance;
 
-    Exec::validate_partition(prev_instance->m_pool_size, num_partitions,
-                             partition_size);
+    Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
+                                  partition_size);
 
     OpenMP::memory_space space;
 
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
index 907e4e9ef..45aacd725 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
@@ -72,7 +72,7 @@ class SYCLInternal {
                                   bool force_shrink = false);
 
   uint32_t impl_get_instance_id() const;
-  int m_syclDev = -1;
+  int m_syclDev = 0;
 
   size_t m_maxWorkgroupSize   = 0;
   uint32_t m_maxConcurrency   = 0;
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 8a7c49871..9682564ee 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -399,27 +399,68 @@ bool ThreadsExec::wake() {
 
 //----------------------------------------------------------------------------
 
-void *ThreadsExec::root_reduce_scratch() {
-  return s_threads_process.reduce_memory();
-}
+void ThreadsExec::execute_resize_scratch_in_serial() {
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0;
 
-void ThreadsExec::execute_resize_scratch(ThreadsExec &exec, const void *) {
-  using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>;
+  auto deallocate_scratch_memory = [](ThreadsExec &exec) {
+    if (exec.m_scratch) {
+      using Record =
+          Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>;
+      Record *const r = Record::get_record(exec.m_scratch);
+      exec.m_scratch  = nullptr;
+      Record::decrement(r);
+    }
+  };
+  if (s_threads_process.m_pool_base) {
+    for (unsigned i = s_thread_pool_size[0]; begin < i;) {
+      deallocate_scratch_memory(*s_threads_exec[--i]);
+    }
+  }
 
-  if (exec.m_scratch) {
-    Record *const r = Record::get_record(exec.m_scratch);
+  s_current_function     = &first_touch_allocate_thread_private_scratch;
+  s_current_function_arg = &s_threads_process;
 
-    exec.m_scratch = nullptr;
+  // Make sure function and arguments are written before activating threads.
+  memory_fence();
 
-    Record::decrement(r);
+  for (unsigned i = s_thread_pool_size[0]; begin < i;) {
+    ThreadsExec &th = *s_threads_exec[--i];
+
+    th.m_pool_state = ThreadsExec::Active;
+
+    wait_yield(th.m_pool_state, ThreadsExec::Active);
   }
 
+  if (s_threads_process.m_pool_base) {
+    deallocate_scratch_memory(s_threads_process);
+    s_threads_process.m_pool_state = ThreadsExec::Active;
+    first_touch_allocate_thread_private_scratch(s_threads_process, nullptr);
+    s_threads_process.m_pool_state = ThreadsExec::Inactive;
+  }
+
+  s_current_function_arg = nullptr;
+  s_current_function     = nullptr;
+
+  // Make sure function and arguments are cleared before proceeding.
+  memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+void *ThreadsExec::root_reduce_scratch() {
+  return s_threads_process.reduce_memory();
+}
+
+void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec,
+                                                              const void *) {
   exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end;
   exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end;
 
   if (s_threads_process.m_scratch_thread_end) {
     // Allocate tracked memory:
     {
+      using Record =
+          Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>;
       Record *const r =
           Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch",
                            s_threads_process.m_scratch_thread_end);
@@ -461,7 +502,7 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) {
     s_threads_process.m_scratch_reduce_end = reduce_size;
     s_threads_process.m_scratch_thread_end = reduce_size + thread_size;
 
-    execute_resize_scratch(s_threads_process, nullptr);
+    execute_resize_scratch_in_serial();
 
     s_threads_process.m_scratch = s_threads_exec[0]->m_scratch;
   }
@@ -845,6 +886,15 @@ void ThreadsSpaceInitializer::print_configuration(std::ostream &msg,
 }
 
 }  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_CXX14
+namespace Tools {
+namespace Experimental {
+constexpr DeviceType DeviceTypeTraits<Threads>::id;
+}
+}  // namespace Tools
+#endif
+
 } /* namespace Kokkos */
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index 561b1ce29..d17f417bb 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -123,12 +123,15 @@ class ThreadsExec {
   static void global_unlock();
   static void spawn();
 
-  static void execute_resize_scratch(ThreadsExec &, const void *);
+  static void first_touch_allocate_thread_private_scratch(ThreadsExec &,
+                                                          const void *);
   static void execute_sleep(ThreadsExec &, const void *);
 
   ThreadsExec(const ThreadsExec &);
   ThreadsExec &operator=(const ThreadsExec &);
 
+  static void execute_resize_scratch_in_serial();
+
  public:
   KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; }
   KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank; }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index 4e0e81405..d52668205 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -118,11 +118,14 @@ template <typename ExecutionSpace>
 constexpr uint32_t device_id_root() {
   constexpr auto device_id =
       static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id);
-  return (device_id << num_instance_bits);
+  return (device_id << (num_instance_bits + num_device_bits));
 }
 template <typename ExecutionSpace>
 inline uint32_t device_id(ExecutionSpace const& space) noexcept {
-  return device_id_root<ExecutionSpace>() + space.impl_instance_id();
+  return device_id_root<ExecutionSpace>() +
+         (DeviceTypeTraits<ExecutionSpace>::device_id(space)
+          << num_instance_bits) +
+         space.impl_instance_id();
 }
 }  // namespace Experimental
 }  // namespace Tools
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
index c49e838d8..e5917eb59 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -233,6 +233,15 @@ void SerialSpaceInitializer::print_configuration(std::ostream& msg,
 }
 
 }  // namespace Impl
+
+#ifdef KOKKOS_ENABLE_CXX14
+namespace Tools {
+namespace Experimental {
+constexpr DeviceType DeviceTypeTraits<Serial>::id;
+}
+}  // namespace Tools
+#endif
+
 }  // namespace Kokkos
 
 #else
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index 09f7af091..f606a3983 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -1005,15 +1005,15 @@ struct ViewOffset<
   /* Cardinality of the domain index space */
   KOKKOS_INLINE_FUNCTION
   constexpr size_type size() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 *
-           m_dim.N6 * m_dim.N7;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5 * m_dim.N6 * m_dim.N7;
   }
 
   /* Span of the range space */
   KOKKOS_INLINE_FUNCTION
   constexpr size_type span() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 *
-           m_dim.N6 * m_dim.N7;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5 * m_dim.N6 * m_dim.N7;
   }
 
   KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const {
@@ -1026,23 +1026,24 @@ struct ViewOffset<
     return m_dim.N0;
   }
   KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const {
-    return m_dim.N0 * m_dim.N1;
+    return size_type(m_dim.N0) * m_dim.N1;
   }
   KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2;
   }
   KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3;
   }
   KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4;
   }
   KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5;
   }
   KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 *
-           m_dim.N6;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5 * m_dim.N6;
   }
 
   // Stride with [ rank ] value is the total length
@@ -1288,8 +1289,8 @@ struct ViewOffset<
   /* Cardinality of the domain index space */
   KOKKOS_INLINE_FUNCTION
   constexpr size_type size() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 *
-           m_dim.N6 * m_dim.N7;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5 * m_dim.N6 * m_dim.N7;
   }
 
   /* Span of the range space */
@@ -1633,15 +1634,15 @@ struct ViewOffset<
   /* Cardinality of the domain index space */
   KOKKOS_INLINE_FUNCTION
   constexpr size_type size() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 *
-           m_dim.N6 * m_dim.N7;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5 * m_dim.N6 * m_dim.N7;
   }
 
   /* Span of the range space */
   KOKKOS_INLINE_FUNCTION
   constexpr size_type span() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 *
-           m_dim.N6 * m_dim.N7;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5 * m_dim.N6 * m_dim.N7;
   }
 
   KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const {
@@ -1916,14 +1917,14 @@ struct ViewOffset<
   /* Cardinality of the domain index space */
   KOKKOS_INLINE_FUNCTION
   constexpr size_type size() const {
-    return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 *
-           m_dim.N6 * m_dim.N7;
+    return size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
+           m_dim.N5 * m_dim.N6 * m_dim.N7;
   }
 
   /* Span of the range space */
   KOKKOS_INLINE_FUNCTION
   constexpr size_type span() const {
-    return size() > 0 ? m_dim.N0 * m_stride : 0;
+    return size() > 0 ? size_type(m_dim.N0) * m_stride : 0;
   }
 
   KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const {
@@ -2066,27 +2067,29 @@ struct ViewOffset<
                 stride(/* 2 <= rank */
                        m_dim.N1 *
                        (dimension_type::rank == 2
-                            ? 1
+                            ? size_t(1)
                             : m_dim.N2 *
                                   (dimension_type::rank == 3
-                                       ? 1
+                                       ? size_t(1)
                                        : m_dim.N3 *
                                              (dimension_type::rank == 4
-                                                  ? 1
+                                                  ? size_t(1)
                                                   : m_dim.N4 *
                                                         (dimension_type::rank ==
                                                                  5
-                                                             ? 1
+                                                             ? size_t(1)
                                                              : m_dim.N5 *
                                                                    (dimension_type::
                                                                                 rank ==
                                                                             6
-                                                                        ? 1
+                                                                        ? size_t(
+                                                                              1)
                                                                         : m_dim.N6 *
                                                                               (dimension_type::
                                                                                            rank ==
                                                                                        7
-                                                                                   ? 1
+                                                                                   ? size_t(
+                                                                                         1)
                                                                                    : m_dim
                                                                                          .N7)))))))) {
   }
@@ -2447,8 +2450,8 @@ struct ViewOffset<Dimension, Kokkos::LayoutStride, void> {
   constexpr size_type size() const {
     return dimension_type::rank == 0
                ? 1
-               : m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 *
-                     m_dim.N5 * m_dim.N6 * m_dim.N7;
+               : size_type(m_dim.N0) * m_dim.N1 * m_dim.N2 * m_dim.N3 *
+                     m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7;
   }
 
  private:
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp
index fe3651886..972b1b6d9 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp
@@ -91,6 +91,7 @@ struct ViewTracker {
 
   template <class RT, class... RP>
   KOKKOS_INLINE_FUNCTION void assign(const View<RT, RP...>& vt) noexcept {
+    if (this == reinterpret_cast<const ViewTracker*>(&vt.m_track)) return;
     KOKKOS_IF_ON_HOST((
         if (view_traits::is_managed && Kokkos::Impl::SharedAllocationRecord<
                                            void, void>::tracking_enabled()) {
@@ -102,6 +103,7 @@ struct ViewTracker {
 
   KOKKOS_INLINE_FUNCTION ViewTracker& operator=(
       const ViewTracker& rhs) noexcept {
+    if (this == &rhs) return *this;
     KOKKOS_IF_ON_HOST((
         if (view_traits::is_managed && Kokkos::Impl::SharedAllocationRecord<
                                            void, void>::tracking_enabled()) {
diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp
index 21602be08..83efae617 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1087,6 +1087,20 @@ class TestViewAPI {
     dView4_unmanaged unmanaged_dx = dx;
     ASSERT_EQ(dx.use_count(), 1);
 
+    // Test self assignment
+#if defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wself-assign-overloaded"
+#endif
+    dx = dx;  // copy-assignment operator
+#if defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+    ASSERT_EQ(dx.use_count(), 1);
+    dx = reinterpret_cast<typename dView4::uniform_type &>(
+        dx);  // conversion assignment operator
+    ASSERT_EQ(dx.use_count(), 1);
+
     dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged(
         dx.data(), dx.extent(0), dx.extent(1), dx.extent(2), dx.extent(3));
 
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
index d4f484a53..d1d38022a 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
@@ -240,6 +240,35 @@ struct TestViewOverloadResolution {
 TEST(TEST_CATEGORY, view_overload_resolution) {
   TestViewOverloadResolution<TEST_EXECSPACE>::test_function_overload();
 }
+
+template <typename MemorySpace>
+struct TestViewAllocationLargeRank {
+  using ViewType = Kokkos::View<char********, MemorySpace>;
+
+  KOKKOS_FUNCTION void operator()(int) const {
+    size_t idx = v.extent(0) - 1;
+    auto& lhs  = v(idx, idx, idx, idx, idx, idx, idx, idx);
+    lhs        = 42;  // This is where it segfaulted
+  }
+
+  ViewType v;
+};
+
+TEST(TEST_CATEGORY, view_allocation_large_rank) {
+  using ExecutionSpace = typename TEST_EXECSPACE::execution_space;
+  using MemorySpace    = typename TEST_EXECSPACE::memory_space;
+  constexpr int dim    = 16;
+  using FunctorType    = TestViewAllocationLargeRank<MemorySpace>;
+  typename FunctorType::ViewType v("v", dim, dim, dim, dim, dim, dim, dim, dim);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, 1),
+                       FunctorType{v});
+  typename FunctorType::ViewType v_single(v.data() + v.size() - 1, 1, 1, 1, 1,
+                                          1, 1, 1, 1);
+  auto result =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, v_single);
+  ASSERT_EQ(result(0, 0, 0, 0, 0, 0, 0, 0), 42);
+}
 }  // namespace Test
 
 #include <TestViewIsAssignable.hpp>
diff --git a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
index 08863232e..bb1d3156f 100644
--- a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
+++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
@@ -238,13 +238,10 @@ TEST(kokkosp, test_id_gen) {
   using Kokkos::Tools::Experimental::DeviceTypeTraits;
   test_wrapper([&]() {
     Kokkos::DefaultExecutionSpace ex;
-    auto id      = device_id(ex);
-    auto id_ref  = identifier_from_devid(id);
-    auto success = (id_ref.instance_id == ex.impl_instance_id()) &&
-                   (id_ref.device_id ==
-                    static_cast<uint32_t>(
-                        DeviceTypeTraits<Kokkos::DefaultExecutionSpace>::id));
-    ASSERT_TRUE(success);
+    auto id     = device_id(ex);
+    auto id_ref = identifier_from_devid(id);
+    ASSERT_EQ(DeviceTypeTraits<decltype(ex)>::id, id_ref.type);
+    ASSERT_EQ(id_ref.instance_id, ex.impl_instance_id());
   });
 }
 
@@ -253,6 +250,7 @@ TEST(kokkosp, test_id_gen) {
  */
 TEST(kokkosp, test_kernel_sequence) {
   test_wrapper([&]() {
+    Kokkos::DefaultExecutionSpace ex;
     auto root = Kokkos::Tools::Experimental::device_id_root<
         Kokkos::DefaultExecutionSpace>();
     std::vector<FencePayload> expected{
@@ -260,11 +258,10 @@ TEST(kokkosp, test_kernel_sequence) {
         {"named_instance", FencePayload::distinguishable_devices::no,
          root + num_instances},
         {"test_kernel", FencePayload::distinguishable_devices::no,
-         root + num_instances}
+         Kokkos::Tools::Experimental::device_id(ex)}
 
     };
     expect_fence_events(expected, [=]() {
-      Kokkos::DefaultExecutionSpace ex;
       TestFunctor tf;
       ex.fence("named_instance");
       Kokkos::parallel_for(
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index e174b47f6..41c755a8a 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -27,3 +27,4 @@ tag:  3.4.00     date: 04:26:2021    master: 1fb0c284    release: 5d7738d6
 tag:  3.4.01     date: 05:20:2021    master: 4b97a22f    release: 410b15c8
 tag:  3.5.00     date: 11:19:2021    master: c28a8b03    release: 21b879e4
 tag:  3.6.00     date: 04:14:2022    master: 2834f94a    release: 6ea708ff
+tag:  3.6.01     date: 06:16:2022    master: b52f8c83    release: afe9b404
diff --git a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
index 1df32051b..e4ab07dbc 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
+++ b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
@@ -1,5 +1,7 @@
 FROM nvidia/cuda:10.1-devel
 
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+
 RUN apt-get update && apt-get install -y \
         bc \
         git \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvcc b/packages/kokkos/scripts/docker/Dockerfile.nvcc
index c3e7a875f..cbae8e47d 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.nvcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.nvcc
@@ -3,6 +3,8 @@ FROM $BASE
 
 ARG ADDITIONAL_PACKAGES
 
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+
 RUN apt-get update && apt-get install -y \
         bc \
         wget \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
index 0599c6085..caeee1821 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.openmptarget
+++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
@@ -1,4 +1,4 @@
-ARG BASE=nvidia/cuda:11.1-devel-ubuntu20.04
+ARG BASE=nvidia/cuda:11.1.1-devel-ubuntu20.04
 FROM $BASE
 
 RUN apt-get update && apt-get install -y \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl
index 8f08fe2e5..1cd700648 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.sycl
+++ b/packages/kokkos/scripts/docker/Dockerfile.sycl
@@ -1,6 +1,8 @@
 ARG BASE=nvidia/cuda:10.2-devel
 FROM $BASE
 
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+
 RUN apt-get update && apt-get install -y \
         bc \
         wget \
-- 
GitLab